blob: 681bfe3b5831f448c8fea606aa73a0af880dbff7 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Check used inside the assert()s of the macros in this file: the
   structural consistency check (without content scan) in debug builds,
   a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors for unicode objects.

   The underscore-prefixed macros without an assert() are raw lvalue
   accessors that perform no checking.  PyUnicode_UTF8() and
   PyUnicode_UTF8_LENGTH() assert that the object is a ready unicode
   string; for compact ASCII strings they return the memory directly
   following the PyASCIIObject header and the string length respectively,
   instead of reading the utf8/utf8_length fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for the public PyUnicode_READY() macro:
   evaluates to 0 when the string is already ready, otherwise to the
   result of _PyUnicode_Ready(op).  Asserts that op passes
   _PyUnicode_CHECK(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer aliases the canonical character data
   (must not be used on compact ASCII strings). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the canonical character data.
   Uses the macro argument throughout; it must not depend on the caller
   happening to have a local variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data); a non-ready string with a wstr pointer
   always counts as owning it */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  `to` is parenthesized before
   the cast so that an expression such as `buf + offset` is converted as a
   whole; the temporaries carry a leading underscore so they cannot shadow
   a caller's variables.  The main loop copies four elements per iteration,
   the trailing loop handles the remaining 0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Same for linebreaks.
   128-entry read-only table indexed by ASCII code: 1 for a line break
   character, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-only structural self-check of a unicode object.

   Asserts the invariants that tie together the state bits (kind, ascii,
   compact, ready, interned) and the data / utf8 / wstr pointers and
   lengths for each of the layouts handled below (compact ASCII, compact
   non-ASCII, non-compact not-ready "wchar" strings, non-compact ready
   strings).  If check_content is non-zero and the string is not of
   PyUnicode_WCHAR_KIND, it additionally scans every character and asserts
   that the narrowest possible kind is in use.

   Always returns 1, so the call can itself be wrapped in assert();
   violations are reported through the assert()s in the body. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: character data directly follows the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
500 return PyUnicode_Copy(unicode);
501}
502
#ifdef HAVE_MBCS
/* Windows version information; only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
509/* stuff to implement simple "bloom filters" for Unicode characters.
510 to keep things simple, we use a single bitmask, using the least 5
511 bits from each unicode characters as the bit index. */
512
513/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525#define BLOOM_MASK unsigned long
526
527static BLOOM_MASK bloom_linebreak;
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
530#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532#define BLOOM_LINEBREAK(ch) \
533 ((ch) < 128U ? ascii_linebreak[(ch)] : \
534 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
/* True if chr occurs in str: the bloom mask is tested first and the
   actual search with PyUnicode_FindChar() only runs on a possible hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100658 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyObject_Del(unicode);
670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 if (PyUnicode_IS_READY(unicode)) {
695 Py_ssize_t char_size;
696 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
701 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200702 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200703 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
704 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200705 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
706 {
707 PyObject_DEL(_PyUnicode_UTF8(unicode));
708 _PyUnicode_UTF8(unicode) = NULL;
709 _PyUnicode_UTF8_LENGTH(unicode) = 0;
710 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711
712 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
713 PyErr_NoMemory();
714 return -1;
715 }
716 new_size = (length + 1) * char_size;
717
718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 wstr = _PyUnicode_WSTR(unicode);
747 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
748 if (!wstr) {
749 PyErr_NoMemory();
750 return -1;
751 }
752 _PyUnicode_WSTR(unicode) = wstr;
753 _PyUnicode_WSTR(unicode)[length] = 0;
754 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200755 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 return 0;
757}
758
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759static PyObject*
760resize_copy(PyObject *unicode, Py_ssize_t length)
761{
762 Py_ssize_t copy_length;
763 if (PyUnicode_IS_COMPACT(unicode)) {
764 PyObject *copy;
765 assert(PyUnicode_IS_READY(unicode));
766
767 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
768 if (copy == NULL)
769 return NULL;
770
771 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200772 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200774 }
775 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200776 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200777 assert(_PyUnicode_WSTR(unicode) != NULL);
778 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780 if (w == NULL)
781 return NULL;
782 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
783 copy_length = Py_MIN(copy_length, length);
784 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
785 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200786 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 }
788}
789
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000791 Ux0000 terminated; some code (e.g. new_identifier)
792 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793
794 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000795 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796
797*/
798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200799#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200800static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#endif
802
Alexander Belopolsky40018472011-02-26 01:02:56 +0000803static PyUnicodeObject *
804_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805{
806 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810 if (length == 0 && unicode_empty != NULL) {
811 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200812 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 }
814
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000815 /* Ensure we won't overflow the size. */
816 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
817 return (PyUnicodeObject *)PyErr_NoMemory();
818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819 if (length < 0) {
820 PyErr_SetString(PyExc_SystemError,
821 "Negative size passed to _PyUnicode_New");
822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 }
824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200825#ifdef Py_DEBUG
826 ++unicode_old_new_calls;
827#endif
828
829 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
830 if (unicode == NULL)
831 return NULL;
832 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
833 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
834 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000835 PyErr_NoMemory();
836 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838
Jeremy Hyltond8082792003-09-16 19:41:39 +0000839 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000840 * the caller fails before initializing str -- unicode_resize()
841 * reads str[0], and the Keep-Alive optimization can keep memory
842 * allocated for str alive across a call to unicode_dealloc(unicode).
843 * We don't want unicode_resize to read uninitialized memory in
844 * that case.
845 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 _PyUnicode_WSTR(unicode)[0] = 0;
847 _PyUnicode_WSTR(unicode)[length] = 0;
848 _PyUnicode_WSTR_LENGTH(unicode) = length;
849 _PyUnicode_HASH(unicode) = -1;
850 _PyUnicode_STATE(unicode).interned = 0;
851 _PyUnicode_STATE(unicode).kind = 0;
852 _PyUnicode_STATE(unicode).compact = 0;
853 _PyUnicode_STATE(unicode).ready = 0;
854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200855 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200857 _PyUnicode_UTF8(unicode) = NULL;
858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000860 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000861
Benjamin Peterson29060642009-01-31 22:14:21 +0000862 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000863 /* XXX UNREF/NEWREF interface should be more symmetrical */
864 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000865 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000866 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868}
869
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870static const char*
871unicode_kind_name(PyObject *unicode)
872{
Victor Stinner42dfd712011-10-03 14:41:45 +0200873 /* don't check consistency: unicode_kind_name() is called from
874 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875 if (!PyUnicode_IS_COMPACT(unicode))
876 {
877 if (!PyUnicode_IS_READY(unicode))
878 return "wstr";
879 switch(PyUnicode_KIND(unicode))
880 {
881 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200882 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200883 return "legacy ascii";
884 else
885 return "legacy latin1";
886 case PyUnicode_2BYTE_KIND:
887 return "legacy UCS2";
888 case PyUnicode_4BYTE_KIND:
889 return "legacy UCS4";
890 default:
891 return "<legacy invalid kind>";
892 }
893 }
894 assert(PyUnicode_IS_READY(unicode));
895 switch(PyUnicode_KIND(unicode))
896 {
897 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 return "ascii";
900 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200901 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200902 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200903 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200904 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 default:
907 return "<invalid compact kind>";
908 }
909}
910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200912static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913
914/* Functions wrapping macros for use in debugger */
915char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200916 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917}
918
919void *_PyUnicode_compact_data(void *unicode) {
920 return _PyUnicode_COMPACT_DATA(unicode);
921}
922void *_PyUnicode_data(void *unicode){
923 printf("obj %p\n", unicode);
924 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
925 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
926 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
927 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
928 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
929 return PyUnicode_DATA(unicode);
930}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200931
932void
933_PyUnicode_Dump(PyObject *op)
934{
935 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200936 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
937 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
938 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200939
Victor Stinnera849a4b2011-10-03 12:12:11 +0200940 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200941 {
942 if (ascii->state.ascii)
943 data = (ascii + 1);
944 else
945 data = (compact + 1);
946 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200947 else
948 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200949 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
950
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 if (ascii->wstr == data)
952 printf("shared ");
953 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200954
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(" (%zu), ", compact->wstr_length);
957 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
958 printf("shared ");
959 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200962}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963#endif
964
965PyObject *
966PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
967{
968 PyObject *obj;
969 PyCompactUnicodeObject *unicode;
970 void *data;
971 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200972 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 Py_ssize_t char_size;
974 Py_ssize_t struct_size;
975
976 /* Optimization for empty strings */
977 if (size == 0 && unicode_empty != NULL) {
978 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200979 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 }
981
982#ifdef Py_DEBUG
983 ++unicode_new_new_calls;
984#endif
985
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 is_ascii = 0;
987 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 struct_size = sizeof(PyCompactUnicodeObject);
989 if (maxchar < 128) {
990 kind_state = PyUnicode_1BYTE_KIND;
991 char_size = 1;
992 is_ascii = 1;
993 struct_size = sizeof(PyASCIIObject);
994 }
995 else if (maxchar < 256) {
996 kind_state = PyUnicode_1BYTE_KIND;
997 char_size = 1;
998 }
999 else if (maxchar < 65536) {
1000 kind_state = PyUnicode_2BYTE_KIND;
1001 char_size = 2;
1002 if (sizeof(wchar_t) == 2)
1003 is_sharing = 1;
1004 }
1005 else {
1006 kind_state = PyUnicode_4BYTE_KIND;
1007 char_size = 4;
1008 if (sizeof(wchar_t) == 4)
1009 is_sharing = 1;
1010 }
1011
1012 /* Ensure we won't overflow the size. */
1013 if (size < 0) {
1014 PyErr_SetString(PyExc_SystemError,
1015 "Negative size passed to PyUnicode_New");
1016 return NULL;
1017 }
1018 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1019 return PyErr_NoMemory();
1020
1021 /* Duplicated allocation code from _PyObject_New() instead of a call to
1022 * PyObject_New() so we are able to allocate space for the object and
1023 * it's data buffer.
1024 */
1025 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1026 if (obj == NULL)
1027 return PyErr_NoMemory();
1028 obj = PyObject_INIT(obj, &PyUnicode_Type);
1029 if (obj == NULL)
1030 return NULL;
1031
1032 unicode = (PyCompactUnicodeObject *)obj;
1033 if (is_ascii)
1034 data = ((PyASCIIObject*)obj) + 1;
1035 else
1036 data = unicode + 1;
1037 _PyUnicode_LENGTH(unicode) = size;
1038 _PyUnicode_HASH(unicode) = -1;
1039 _PyUnicode_STATE(unicode).interned = 0;
1040 _PyUnicode_STATE(unicode).kind = kind_state;
1041 _PyUnicode_STATE(unicode).compact = 1;
1042 _PyUnicode_STATE(unicode).ready = 1;
1043 _PyUnicode_STATE(unicode).ascii = is_ascii;
1044 if (is_ascii) {
1045 ((char*)data)[size] = 0;
1046 _PyUnicode_WSTR(unicode) = NULL;
1047 }
1048 else if (kind_state == PyUnicode_1BYTE_KIND) {
1049 ((char*)data)[size] = 0;
1050 _PyUnicode_WSTR(unicode) = NULL;
1051 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001053 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 }
1055 else {
1056 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001057 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 if (kind_state == PyUnicode_2BYTE_KIND)
1059 ((Py_UCS2*)data)[size] = 0;
1060 else /* kind_state == PyUnicode_4BYTE_KIND */
1061 ((Py_UCS4*)data)[size] = 0;
1062 if (is_sharing) {
1063 _PyUnicode_WSTR_LENGTH(unicode) = size;
1064 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1065 }
1066 else {
1067 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1068 _PyUnicode_WSTR(unicode) = NULL;
1069 }
1070 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001071 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 return obj;
1073}
1074
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4 and store it in
   the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit is copied as is.  The remaining conversions are implemented as
   macros for efficiency.

   The caller must have sized `unicode` so that the converted characters
   fill it exactly (checked by assertions); no terminator is written
   here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* A well-formed pair collapses into a single code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1114
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115static int
Victor Stinner488fa492011-12-12 00:01:39 +01001116unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117{
Victor Stinner488fa492011-12-12 00:01:39 +01001118 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001119 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001120 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return -1;
1122 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001123 return 0;
1124}
1125
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001126static int
1127_copy_characters(PyObject *to, Py_ssize_t to_start,
1128 PyObject *from, Py_ssize_t from_start,
1129 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 unsigned int from_kind, to_kind;
1132 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001133 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
1136 assert(PyUnicode_Check(to));
1137 assert(PyUnicode_IS_READY(from));
1138 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001140 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001144 if (how_many == 0)
1145 return 0;
1146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001150 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152#ifdef Py_DEBUG
1153 if (!check_maxchar
1154 && (from_kind > to_kind
1155 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001157 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1158 Py_UCS4 ch;
1159 Py_ssize_t i;
1160 for (i=0; i < how_many; i++) {
1161 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1162 assert(ch <= to_maxchar);
1163 }
1164 }
1165#endif
1166 fast = (from_kind == to_kind);
1167 if (check_maxchar
1168 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1169 {
1170 /* deny latin1 => ascii */
1171 fast = 0;
1172 }
1173
1174 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001175 Py_MEMCPY((char*)to_data + to_kind * to_start,
1176 (char*)from_data + from_kind * from_start,
1177 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
1180 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS2,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_2BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001189 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS1, Py_UCS4,
1194 PyUnicode_1BYTE_DATA(from) + from_start,
1195 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
1199 else if (from_kind == PyUnicode_2BYTE_KIND
1200 && to_kind == PyUnicode_4BYTE_KIND)
1201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS2, Py_UCS4,
1204 PyUnicode_2BYTE_DATA(from) + from_start,
1205 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_4BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001210 /* check if max_char(from substring) <= max_char(to) */
1211 if (from_kind > to_kind
1212 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001213 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001215 /* slow path to check for character overflow */
1216 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001217 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 Py_ssize_t i;
1219
Victor Stinner56c161a2011-10-06 02:47:11 +02001220#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001221 for (i=0; i < how_many; i++) {
1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001223 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001224 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1225 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001226#else
1227 if (!check_maxchar) {
1228 for (i=0; i < how_many; i++) {
1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1230 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1231 }
1232 }
1233 else {
1234 for (i=0; i < how_many; i++) {
1235 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1236 if (ch > to_maxchar)
1237 return 1;
1238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239 }
1240 }
1241#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001244 assert(0 && "inconsistent state");
1245 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001246 }
1247 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001248 return 0;
1249}
1250
1251static void
1252copy_characters(PyObject *to, Py_ssize_t to_start,
1253 PyObject *from, Py_ssize_t from_start,
1254 Py_ssize_t how_many)
1255{
1256 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1257}
1258
1259Py_ssize_t
1260PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1261 PyObject *from, Py_ssize_t from_start,
1262 Py_ssize_t how_many)
1263{
1264 int err;
1265
1266 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1267 PyErr_BadInternalCall();
1268 return -1;
1269 }
1270
1271 if (PyUnicode_READY(from))
1272 return -1;
1273 if (PyUnicode_READY(to))
1274 return -1;
1275
1276 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1277 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1278 PyErr_Format(PyExc_SystemError,
1279 "Cannot write %zi characters at %zi "
1280 "in a string of %zi characters",
1281 how_many, to_start, PyUnicode_GET_LENGTH(to));
1282 return -1;
1283 }
1284
1285 if (how_many == 0)
1286 return 0;
1287
Victor Stinner488fa492011-12-12 00:01:39 +01001288 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289 return -1;
1290
1291 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1292 if (err) {
1293 PyErr_Format(PyExc_SystemError,
1294 "Cannot copy %s characters "
1295 "into a string of %s characters",
1296 unicode_kind_name(from),
1297 unicode_kind_name(to));
1298 return -1;
1299 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001300 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinner17222162011-09-28 22:15:37 +02001303/* Find the maximum code point and count the number of surrogate pairs so a
1304 correct string length can be computed before converting a string to UCS4.
1305 This function counts single surrogates as a character and not as a pair.
1306
1307 Return 0 on success, or -1 on error. */
1308static int
1309find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1310 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311{
1312 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001313 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314
Victor Stinnerc53be962011-10-02 21:33:54 +02001315 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 *num_surrogates = 0;
1317 *maxchar = 0;
1318
1319 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001321 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1322 && (iter+1) < end
1323 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001325 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 iter += 2;
1328 }
1329 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001331 {
1332 ch = *iter;
1333 iter++;
1334 }
1335 if (ch > *maxchar) {
1336 *maxchar = ch;
1337 if (*maxchar > MAX_UNICODE) {
1338 PyErr_Format(PyExc_ValueError,
1339 "character U+%x is not in range [U+0000; U+10ffff]",
1340 ch);
1341 return -1;
1342 }
1343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 }
1345 return 0;
1346}
1347
#ifdef Py_DEBUG
/* Debug builds only: number of calls made to _PyUnicode_Ready().  Objects
   with static storage duration start out as zero. */
static int unicode_ready_calls;
#endif
1351
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001352int
1353_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354{
1355 wchar_t *end;
1356 Py_UCS4 maxchar = 0;
1357 Py_ssize_t num_surrogates;
1358#if SIZEOF_WCHAR_T == 2
1359 Py_ssize_t length_wo_surrogates;
1360#endif
1361
Georg Brandl7597add2011-10-05 16:36:47 +02001362 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001363 strings were created using _PyObject_New() and where no canonical
1364 representation (the str field) has been set yet aka strings
1365 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001366 assert(_PyUnicode_CHECK(unicode));
1367 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001370 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001371 /* Actually, it should neither be interned nor be anything else: */
1372 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
1374#ifdef Py_DEBUG
1375 ++unicode_ready_calls;
1376#endif
1377
1378 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001379 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001380 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382
1383 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001384 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1385 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 PyErr_NoMemory();
1387 return -1;
1388 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001389 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 _PyUnicode_WSTR(unicode), end,
1391 PyUnicode_1BYTE_DATA(unicode));
1392 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1393 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1394 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1395 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001397 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001398 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001401 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8(unicode) = NULL;
1403 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 }
1405 PyObject_FREE(_PyUnicode_WSTR(unicode));
1406 _PyUnicode_WSTR(unicode) = NULL;
1407 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408 }
1409 /* In this case we might have to convert down from 4-byte native
1410 wchar_t to 2-byte unicode. */
1411 else if (maxchar < 65536) {
1412 assert(num_surrogates == 0 &&
1413 "FindMaxCharAndNumSurrogatePairs() messed up");
1414
Victor Stinner506f5922011-09-28 22:34:18 +02001415#if SIZEOF_WCHAR_T == 2
1416 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1419 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1420 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001423#else
1424 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001425 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001426 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001428 PyErr_NoMemory();
1429 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
Victor Stinner506f5922011-09-28 22:34:18 +02001431 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1432 _PyUnicode_WSTR(unicode), end,
1433 PyUnicode_2BYTE_DATA(unicode));
1434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439 PyObject_FREE(_PyUnicode_WSTR(unicode));
1440 _PyUnicode_WSTR(unicode) = NULL;
1441 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1442#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 }
1444 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1445 else {
1446#if SIZEOF_WCHAR_T == 2
1447 /* in case the native representation is 2-bytes, we need to allocate a
1448 new normalized 4-byte version. */
1449 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1451 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyErr_NoMemory();
1453 return -1;
1454 }
1455 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1456 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001457 _PyUnicode_UTF8(unicode) = NULL;
1458 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001459 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1460 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001461 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 PyObject_FREE(_PyUnicode_WSTR(unicode));
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1465#else
1466 assert(num_surrogates == 0);
1467
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1473#endif
1474 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1475 }
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001477 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return 0;
1479}
1480
Alexander Belopolsky40018472011-02-26 01:02:56 +00001481static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001482unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001483{
Walter Dörwald16807132007-05-25 13:52:07 +00001484 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 case SSTATE_NOT_INTERNED:
1486 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001487
Benjamin Peterson29060642009-01-31 22:14:21 +00001488 case SSTATE_INTERNED_MORTAL:
1489 /* revive dead object temporarily for DelItem */
1490 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001491 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 Py_FatalError(
1493 "deletion of interned string failed");
1494 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001495
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 case SSTATE_INTERNED_IMMORTAL:
1497 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001498
Benjamin Peterson29060642009-01-31 22:14:21 +00001499 default:
1500 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001501 }
1502
Victor Stinner03490912011-10-03 23:45:12 +02001503 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001505 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001506 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507
1508 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001509 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 }
1511 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001512 if (_PyUnicode_DATA_ANY(unicode))
1513 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001514 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 }
1516}
1517
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singleton
   objects visible here (the empty string, or the cached one-character
   string stored in unicode_latin1[]), otherwise 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready (non wchar-kind) string of length 1 can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1534
Alexander Belopolsky40018472011-02-26 01:02:56 +00001535static int
Victor Stinner488fa492011-12-12 00:01:39 +01001536unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001537{
Victor Stinner488fa492011-12-12 00:01:39 +01001538 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001539 if (Py_REFCNT(unicode) != 1)
1540 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001541 if (_PyUnicode_HASH(unicode) != -1)
1542 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543 if (PyUnicode_CHECK_INTERNED(unicode))
1544 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001545 if (!PyUnicode_CheckExact(unicode))
1546 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001547#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001548 /* singleton refcount is greater than 1 */
1549 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001550#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001551 return 1;
1552}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001553
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554static int
1555unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1556{
1557 PyObject *unicode;
1558 Py_ssize_t old_length;
1559
1560 assert(p_unicode != NULL);
1561 unicode = *p_unicode;
1562
1563 assert(unicode != NULL);
1564 assert(PyUnicode_Check(unicode));
1565 assert(0 <= length);
1566
Victor Stinner910337b2011-10-03 03:20:16 +02001567 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568 old_length = PyUnicode_WSTR_LENGTH(unicode);
1569 else
1570 old_length = PyUnicode_GET_LENGTH(unicode);
1571 if (old_length == length)
1572 return 0;
1573
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001574 if (length == 0) {
1575 Py_DECREF(*p_unicode);
1576 *p_unicode = unicode_empty;
1577 Py_INCREF(*p_unicode);
1578 return 0;
1579 }
1580
Victor Stinner488fa492011-12-12 00:01:39 +01001581 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582 PyObject *copy = resize_copy(unicode, length);
1583 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001585 Py_DECREF(*p_unicode);
1586 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001588 }
1589
Victor Stinnerfe226c02011-10-03 03:52:20 +02001590 if (PyUnicode_IS_COMPACT(unicode)) {
1591 *p_unicode = resize_compact(unicode, length);
1592 if (*p_unicode == NULL)
1593 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001594 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001596 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001597 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598}
1599
Alexander Belopolsky40018472011-02-26 01:02:56 +00001600int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001602{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 PyObject *unicode;
1604 if (p_unicode == NULL) {
1605 PyErr_BadInternalCall();
1606 return -1;
1607 }
1608 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001610 {
1611 PyErr_BadInternalCall();
1612 return -1;
1613 }
1614 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001615}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001616
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001617static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001618unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001619{
1620 PyObject *result;
1621 assert(PyUnicode_IS_READY(*p_unicode));
1622 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1623 return 0;
1624 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1625 maxchar);
1626 if (result == NULL)
1627 return -1;
1628 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1629 PyUnicode_GET_LENGTH(*p_unicode));
1630 Py_DECREF(*p_unicode);
1631 *p_unicode = result;
1632 return 0;
1633}
1634
1635static int
1636unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1637 Py_UCS4 ch)
1638{
1639 if (unicode_widen(p_unicode, ch) < 0)
1640 return -1;
1641 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1642 PyUnicode_DATA(*p_unicode),
1643 (*pos)++, ch);
1644 return 0;
1645}
1646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647static PyObject*
1648get_latin1_char(unsigned char ch)
1649{
Victor Stinnera464fc12011-10-02 20:39:30 +02001650 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001652 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 if (!unicode)
1654 return NULL;
1655 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001656 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 unicode_latin1[ch] = unicode;
1658 }
1659 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001660 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661}
1662
Alexander Belopolsky40018472011-02-26 01:02:56 +00001663PyObject *
1664PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001666 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 Py_UCS4 maxchar = 0;
1668 Py_ssize_t num_surrogates;
1669
1670 if (u == NULL)
1671 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 /* If the Unicode data is known at construction time, we can apply
1674 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 /* Optimization for empty strings */
1677 if (size == 0 && unicode_empty != NULL) {
1678 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001679 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001680 }
Tim Petersced69f82003-09-16 20:30:58 +00001681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 /* Single character Unicode objects in the Latin-1 range are
1683 shared when using this constructor */
1684 if (size == 1 && *u < 256)
1685 return get_latin1_char((unsigned char)*u);
1686
1687 /* If not empty and not single character, copy the Unicode data
1688 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001689 if (find_maxchar_surrogates(u, u + size,
1690 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 return NULL;
1692
Victor Stinner8faf8212011-12-08 22:14:11 +01001693 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 if (!unicode)
1695 return NULL;
1696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 switch (PyUnicode_KIND(unicode)) {
1698 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001699 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1701 break;
1702 case PyUnicode_2BYTE_KIND:
1703#if Py_UNICODE_SIZE == 2
1704 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1705#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001706 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1708#endif
1709 break;
1710 case PyUnicode_4BYTE_KIND:
1711#if SIZEOF_WCHAR_T == 2
1712 /* This is the only case which has to process surrogates, thus
1713 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001714 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715#else
1716 assert(num_surrogates == 0);
1717 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1718#endif
1719 break;
1720 default:
1721 assert(0 && "Impossible state");
1722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001724 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725}
1726
Alexander Belopolsky40018472011-02-26 01:02:56 +00001727PyObject *
1728PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001729{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001730 if (size < 0) {
1731 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001733 return NULL;
1734 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001735 if (u != NULL)
1736 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1737 else
1738 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001739}
1740
Alexander Belopolsky40018472011-02-26 01:02:56 +00001741PyObject *
1742PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001743{
1744 size_t size = strlen(u);
1745 if (size > PY_SSIZE_T_MAX) {
1746 PyErr_SetString(PyExc_OverflowError, "input too long");
1747 return NULL;
1748 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001749 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001750}
1751
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001752PyObject *
1753_PyUnicode_FromId(_Py_Identifier *id)
1754{
1755 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001756 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1757 strlen(id->string),
1758 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001759 if (!id->object)
1760 return NULL;
1761 PyUnicode_InternInPlace(&id->object);
1762 assert(!id->next);
1763 id->next = static_strings;
1764 static_strings = id;
1765 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001766 return id->object;
1767}
1768
1769void
1770_PyUnicode_ClearStaticStrings()
1771{
1772 _Py_Identifier *i;
1773 for (i = static_strings; i; i = i->next) {
1774 Py_DECREF(i->object);
1775 i->object = NULL;
1776 i->next = NULL;
1777 }
1778}
1779
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001780/* Internal function, don't check maximum character */
1781
Victor Stinnere57b1c02011-09-28 22:20:48 +02001782static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001783unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001784{
Victor Stinner785938e2011-12-11 20:09:03 +01001785 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001786 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001787#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001788 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001789#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001790 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001791 }
Victor Stinner785938e2011-12-11 20:09:03 +01001792 unicode = PyUnicode_New(size, 127);
1793 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001794 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001795 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1796 assert(_PyUnicode_CheckConsistency(unicode, 1));
1797 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001798}
1799
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001800static Py_UCS4
1801kind_maxchar_limit(unsigned int kind)
1802{
1803 switch(kind) {
1804 case PyUnicode_1BYTE_KIND:
1805 return 0x80;
1806 case PyUnicode_2BYTE_KIND:
1807 return 0x100;
1808 case PyUnicode_4BYTE_KIND:
1809 return 0x10000;
1810 default:
1811 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001812 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001813 }
1814}
1815
Victor Stinner702c7342011-10-05 13:50:52 +02001816static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001817_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001821
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001822 if (size == 0) {
1823 Py_INCREF(unicode_empty);
1824 return unicode_empty;
1825 }
1826 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001827 if (size == 1)
1828 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001829
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001830 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001831 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 if (!res)
1833 return NULL;
1834 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001835 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001837}
1838
Victor Stinnere57b1c02011-09-28 22:20:48 +02001839static PyObject*
1840_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841{
1842 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001844
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001845 if (size == 0) {
1846 Py_INCREF(unicode_empty);
1847 return unicode_empty;
1848 }
1849 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001850 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001851 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001852
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001853 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001854 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 if (!res)
1856 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001857 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001858 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001859 else {
1860 _PyUnicode_CONVERT_BYTES(
1861 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1862 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001863 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 return res;
1865}
1866
Victor Stinnere57b1c02011-09-28 22:20:48 +02001867static PyObject*
1868_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869{
1870 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001871 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001872
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001873 if (size == 0) {
1874 Py_INCREF(unicode_empty);
1875 return unicode_empty;
1876 }
1877 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001878 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001879 return get_latin1_char((unsigned char)u[0]);
1880
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001881 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001882 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 if (!res)
1884 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001885 if (max_char < 256)
1886 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1887 PyUnicode_1BYTE_DATA(res));
1888 else if (max_char < 0x10000)
1889 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1890 PyUnicode_2BYTE_DATA(res));
1891 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return res;
1895}
1896
1897PyObject*
1898PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1899{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001900 if (size < 0) {
1901 PyErr_SetString(PyExc_ValueError, "size must be positive");
1902 return NULL;
1903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 switch(kind) {
1905 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001908 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001912 PyErr_SetString(PyExc_SystemError, "invalid kind");
1913 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915}
1916
Victor Stinner25a4b292011-10-06 12:31:55 +02001917/* Ensure that a string uses the most efficient storage, if it is not the
1918 case: create a new string with of the right kind. Write NULL into *p_unicode
1919 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001920static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001921unicode_adjust_maxchar(PyObject **p_unicode)
1922{
1923 PyObject *unicode, *copy;
1924 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001925 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001926 unsigned int kind;
1927
1928 assert(p_unicode != NULL);
1929 unicode = *p_unicode;
1930 assert(PyUnicode_IS_READY(unicode));
1931 if (PyUnicode_IS_ASCII(unicode))
1932 return;
1933
1934 len = PyUnicode_GET_LENGTH(unicode);
1935 kind = PyUnicode_KIND(unicode);
1936 if (kind == PyUnicode_1BYTE_KIND) {
1937 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001938 max_char = ucs1lib_find_max_char(u, u + len);
1939 if (max_char >= 128)
1940 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 }
1942 else if (kind == PyUnicode_2BYTE_KIND) {
1943 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs2lib_find_max_char(u, u + len);
1945 if (max_char >= 256)
1946 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001947 }
1948 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001950 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001951 max_char = ucs4lib_find_max_char(u, u + len);
1952 if (max_char >= 0x10000)
1953 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001954 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001955 copy = PyUnicode_New(len, max_char);
1956 copy_characters(copy, 0, unicode, 0, len);
1957 Py_DECREF(unicode);
1958 *p_unicode = copy;
1959}
1960
Victor Stinner034f6cf2011-09-30 02:26:44 +02001961PyObject*
1962PyUnicode_Copy(PyObject *unicode)
1963{
Victor Stinner87af4f22011-11-21 23:03:47 +01001964 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001965 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001966
Victor Stinner034f6cf2011-09-30 02:26:44 +02001967 if (!PyUnicode_Check(unicode)) {
1968 PyErr_BadInternalCall();
1969 return NULL;
1970 }
1971 if (PyUnicode_READY(unicode))
1972 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001973
Victor Stinner87af4f22011-11-21 23:03:47 +01001974 length = PyUnicode_GET_LENGTH(unicode);
1975 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001976 if (!copy)
1977 return NULL;
1978 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1979
Victor Stinner87af4f22011-11-21 23:03:47 +01001980 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1981 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001982 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001983 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001984}
1985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986
Victor Stinnerbc603d12011-10-02 01:00:40 +02001987/* Widen Unicode objects to larger buffers. Don't write terminating null
1988 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989
1990void*
1991_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1992{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001993 Py_ssize_t len;
1994 void *result;
1995 unsigned int skind;
1996
1997 if (PyUnicode_READY(s))
1998 return NULL;
1999
2000 len = PyUnicode_GET_LENGTH(s);
2001 skind = PyUnicode_KIND(s);
2002 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002003 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 return NULL;
2005 }
2006 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002007 case PyUnicode_2BYTE_KIND:
2008 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2009 if (!result)
2010 return PyErr_NoMemory();
2011 assert(skind == PyUnicode_1BYTE_KIND);
2012 _PyUnicode_CONVERT_BYTES(
2013 Py_UCS1, Py_UCS2,
2014 PyUnicode_1BYTE_DATA(s),
2015 PyUnicode_1BYTE_DATA(s) + len,
2016 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002018 case PyUnicode_4BYTE_KIND:
2019 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2020 if (!result)
2021 return PyErr_NoMemory();
2022 if (skind == PyUnicode_2BYTE_KIND) {
2023 _PyUnicode_CONVERT_BYTES(
2024 Py_UCS2, Py_UCS4,
2025 PyUnicode_2BYTE_DATA(s),
2026 PyUnicode_2BYTE_DATA(s) + len,
2027 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 else {
2030 assert(skind == PyUnicode_1BYTE_KIND);
2031 _PyUnicode_CONVERT_BYTES(
2032 Py_UCS1, Py_UCS4,
2033 PyUnicode_1BYTE_DATA(s),
2034 PyUnicode_1BYTE_DATA(s) + len,
2035 result);
2036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002038 default:
2039 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 }
Victor Stinner01698042011-10-04 00:04:26 +02002041 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 return NULL;
2043}
2044
2045static Py_UCS4*
2046as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2047 int copy_null)
2048{
2049 int kind;
2050 void *data;
2051 Py_ssize_t len, targetlen;
2052 if (PyUnicode_READY(string) == -1)
2053 return NULL;
2054 kind = PyUnicode_KIND(string);
2055 data = PyUnicode_DATA(string);
2056 len = PyUnicode_GET_LENGTH(string);
2057 targetlen = len;
2058 if (copy_null)
2059 targetlen++;
2060 if (!target) {
2061 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2062 PyErr_NoMemory();
2063 return NULL;
2064 }
2065 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2066 if (!target) {
2067 PyErr_NoMemory();
2068 return NULL;
2069 }
2070 }
2071 else {
2072 if (targetsize < targetlen) {
2073 PyErr_Format(PyExc_SystemError,
2074 "string is longer than the buffer");
2075 if (copy_null && 0 < targetsize)
2076 target[0] = 0;
2077 return NULL;
2078 }
2079 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002080 if (kind == PyUnicode_1BYTE_KIND) {
2081 Py_UCS1 *start = (Py_UCS1 *) data;
2082 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002084 else if (kind == PyUnicode_2BYTE_KIND) {
2085 Py_UCS2 *start = (Py_UCS2 *) data;
2086 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2087 }
2088 else {
2089 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 if (copy_null)
2093 target[len] = 0;
2094 return target;
2095}
2096
2097Py_UCS4*
2098PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2099 int copy_null)
2100{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002101 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 PyErr_BadInternalCall();
2103 return NULL;
2104 }
2105 return as_ucs4(string, target, targetsize, copy_null);
2106}
2107
2108Py_UCS4*
2109PyUnicode_AsUCS4Copy(PyObject *string)
2110{
2111 return as_ucs4(string, NULL, 0, 1);
2112}
2113
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` characters.

   A size of -1 means "use wcslen(w)". A NULL buffer is only accepted
   together with size 0 (the shared empty string is returned); any other
   NULL request is reported as a bad internal call. The conversion itself
   is done by PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002136
Walter Dörwald346737f2007-05-31 10:44:43 +00002137static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002138makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2139 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002140{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 *fmt++ = '%';
2142 if (width) {
2143 if (zeropad)
2144 *fmt++ = '0';
2145 fmt += sprintf(fmt, "%d", width);
2146 }
2147 if (precision)
2148 fmt += sprintf(fmt, ".%d", precision);
2149 if (longflag)
2150 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002151 else if (longlongflag) {
2152 /* longlongflag should only ever be nonzero on machines with
2153 HAVE_LONG_LONG defined */
2154#ifdef HAVE_LONG_LONG
2155 char *f = PY_FORMAT_LONG_LONG;
2156 while (*f)
2157 *fmt++ = *f++;
2158#else
2159 /* we shouldn't ever get here */
2160 assert(0);
2161 *fmt++ = 'l';
2162#endif
2163 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002164 else if (size_tflag) {
2165 char *f = PY_FORMAT_SIZE_T;
2166 while (*f)
2167 *fmt++ = *f++;
2168 }
2169 *fmt++ = c;
2170 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002171}
2172
/* Helpers for PyUnicode_FromFormatV() */

/* Return non-zero if c is one of the integer conversions that may follow a
   length modifier ('d', 'u' or 'i'). */
static int
is_modifiable_int_conversion(char c)
{
    return c == 'd' || c == 'u' || c == 'i';
}

/* Parse the part of a format specification that follows '%'.

   f must point at the '%'.  The width and precision digits are read (e.g.
   "%2.5s" => width=2, precision=5), then an optional "l", "ll" (only with
   HAVE_LONG_LONG) or "z" length modifier is recognised, but only when it is
   directly followed by 'd', 'u' or 'i'.

   Each p_* argument may be NULL when the caller is not interested in the
   value.  The return value points at the conversion character, or, for the
   malformed inputs handled below, at the last character that was consumed
   so that the caller's f++ does not run past the end of the string. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t variants. */
    if (*f == 'l') {
        if (is_modifiable_int_conversion(f[1])) {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && is_modifiable_int_conversion(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && is_modifiable_int_conversion(f[1])) {
        size_tflag = 1;
        ++f;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2237
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002238/* maximum number of characters required for output of %ld. 21 characters
2239 allows for 64-bit integers (in decimal) and an optional sign. */
2240#define MAX_LONG_CHARS 21
2241/* maximum number of characters required for output of %lld.
2242 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2243 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2244#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2245
Walter Dörwaldd2034312007-05-18 16:29:38 +00002246PyObject *
2247PyUnicode_FromFormatV(const char *format, va_list vargs)
2248{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 va_list count;
2250 Py_ssize_t callcount = 0;
2251 PyObject **callresults = NULL;
2252 PyObject **callresult = NULL;
2253 Py_ssize_t n = 0;
2254 int width = 0;
2255 int precision = 0;
2256 int zeropad;
2257 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002258 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002259 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002260 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2262 Py_UCS4 argmaxchar;
2263 Py_ssize_t numbersize = 0;
2264 char *numberresults = NULL;
2265 char *numberresult = NULL;
2266 Py_ssize_t i;
2267 int kind;
2268 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002269
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002270 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002271 /* step 1: count the number of %S/%R/%A/%s format specifications
2272 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2273 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002275 * also estimate a upper bound for all the number formats in the string,
2276 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 for (f = format; *f; f++) {
2279 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002280 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2282 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2283 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2284 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002287#ifdef HAVE_LONG_LONG
2288 if (longlongflag) {
2289 if (width < MAX_LONG_LONG_CHARS)
2290 width = MAX_LONG_LONG_CHARS;
2291 }
2292 else
2293#endif
2294 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2295 including sign. Decimal takes the most space. This
2296 isn't enough for octal. If a width is specified we
2297 need more (which we allocate later). */
2298 if (width < MAX_LONG_CHARS)
2299 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300
2301 /* account for the size + '\0' to separate numbers
2302 inside of the numberresults buffer */
2303 numbersize += (width + 1);
2304 }
2305 }
2306 else if ((unsigned char)*f > 127) {
2307 PyErr_Format(PyExc_ValueError,
2308 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2309 "string, got a non-ASCII byte: 0x%02x",
2310 (unsigned char)*f);
2311 return NULL;
2312 }
2313 }
2314 /* step 2: allocate memory for the results of
2315 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2316 if (callcount) {
2317 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2318 if (!callresults) {
2319 PyErr_NoMemory();
2320 return NULL;
2321 }
2322 callresult = callresults;
2323 }
2324 /* step 2.5: allocate memory for the results of formating numbers */
2325 if (numbersize) {
2326 numberresults = PyObject_Malloc(numbersize);
2327 if (!numberresults) {
2328 PyErr_NoMemory();
2329 goto fail;
2330 }
2331 numberresult = numberresults;
2332 }
2333
2334 /* step 3: format numbers and figure out how large a buffer we need */
2335 for (f = format; *f; f++) {
2336 if (*f == '%') {
2337 const char* p;
2338 int longflag;
2339 int longlongflag;
2340 int size_tflag;
2341 int numprinted;
2342
2343 p = f;
2344 zeropad = (f[1] == '0');
2345 f = parse_format_flags(f, &width, &precision,
2346 &longflag, &longlongflag, &size_tflag);
2347 switch (*f) {
2348 case 'c':
2349 {
2350 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002351 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 n++;
2353 break;
2354 }
2355 case '%':
2356 n++;
2357 break;
2358 case 'i':
2359 case 'd':
2360 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2361 width, precision, *f);
2362 if (longflag)
2363 numprinted = sprintf(numberresult, fmt,
2364 va_arg(count, long));
2365#ifdef HAVE_LONG_LONG
2366 else if (longlongflag)
2367 numprinted = sprintf(numberresult, fmt,
2368 va_arg(count, PY_LONG_LONG));
2369#endif
2370 else if (size_tflag)
2371 numprinted = sprintf(numberresult, fmt,
2372 va_arg(count, Py_ssize_t));
2373 else
2374 numprinted = sprintf(numberresult, fmt,
2375 va_arg(count, int));
2376 n += numprinted;
2377 /* advance by +1 to skip over the '\0' */
2378 numberresult += (numprinted + 1);
2379 assert(*(numberresult - 1) == '\0');
2380 assert(*(numberresult - 2) != '\0');
2381 assert(numprinted >= 0);
2382 assert(numberresult <= numberresults + numbersize);
2383 break;
2384 case 'u':
2385 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2386 width, precision, 'u');
2387 if (longflag)
2388 numprinted = sprintf(numberresult, fmt,
2389 va_arg(count, unsigned long));
2390#ifdef HAVE_LONG_LONG
2391 else if (longlongflag)
2392 numprinted = sprintf(numberresult, fmt,
2393 va_arg(count, unsigned PY_LONG_LONG));
2394#endif
2395 else if (size_tflag)
2396 numprinted = sprintf(numberresult, fmt,
2397 va_arg(count, size_t));
2398 else
2399 numprinted = sprintf(numberresult, fmt,
2400 va_arg(count, unsigned int));
2401 n += numprinted;
2402 numberresult += (numprinted + 1);
2403 assert(*(numberresult - 1) == '\0');
2404 assert(*(numberresult - 2) != '\0');
2405 assert(numprinted >= 0);
2406 assert(numberresult <= numberresults + numbersize);
2407 break;
2408 case 'x':
2409 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2410 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2411 n += numprinted;
2412 numberresult += (numprinted + 1);
2413 assert(*(numberresult - 1) == '\0');
2414 assert(*(numberresult - 2) != '\0');
2415 assert(numprinted >= 0);
2416 assert(numberresult <= numberresults + numbersize);
2417 break;
2418 case 'p':
2419 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2420 /* %p is ill-defined: ensure leading 0x. */
2421 if (numberresult[1] == 'X')
2422 numberresult[1] = 'x';
2423 else if (numberresult[1] != 'x') {
2424 memmove(numberresult + 2, numberresult,
2425 strlen(numberresult) + 1);
2426 numberresult[0] = '0';
2427 numberresult[1] = 'x';
2428 numprinted += 2;
2429 }
2430 n += numprinted;
2431 numberresult += (numprinted + 1);
2432 assert(*(numberresult - 1) == '\0');
2433 assert(*(numberresult - 2) != '\0');
2434 assert(numprinted >= 0);
2435 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002436 break;
2437 case 's':
2438 {
2439 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002440 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002441 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002442 if (!str)
2443 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 /* since PyUnicode_DecodeUTF8 returns already flexible
2445 unicode objects, there is no need to call ready on them */
2446 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002447 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002449 /* Remember the str and switch to the next slot */
2450 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002451 break;
2452 }
2453 case 'U':
2454 {
2455 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002456 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 if (PyUnicode_READY(obj) == -1)
2458 goto fail;
2459 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002460 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002462 break;
2463 }
2464 case 'V':
2465 {
2466 PyObject *obj = va_arg(count, PyObject *);
2467 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002470 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002471 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 if (PyUnicode_READY(obj) == -1)
2473 goto fail;
2474 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002475 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002477 *callresult++ = NULL;
2478 }
2479 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002480 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 if (!str_obj)
2482 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002483 if (PyUnicode_READY(str_obj)) {
2484 Py_DECREF(str_obj);
2485 goto fail;
2486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002488 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002490 *callresult++ = str_obj;
2491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 break;
2493 }
2494 case 'S':
2495 {
2496 PyObject *obj = va_arg(count, PyObject *);
2497 PyObject *str;
2498 assert(obj);
2499 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002503 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 /* Remember the str and switch to the next slot */
2506 *callresult++ = str;
2507 break;
2508 }
2509 case 'R':
2510 {
2511 PyObject *obj = va_arg(count, PyObject *);
2512 PyObject *repr;
2513 assert(obj);
2514 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002518 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 /* Remember the repr and switch to the next slot */
2521 *callresult++ = repr;
2522 break;
2523 }
2524 case 'A':
2525 {
2526 PyObject *obj = va_arg(count, PyObject *);
2527 PyObject *ascii;
2528 assert(obj);
2529 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002531 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002533 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 /* Remember the repr and switch to the next slot */
2536 *callresult++ = ascii;
2537 break;
2538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 default:
2540 /* if we stumble upon an unknown
2541 formatting code, copy the rest of
2542 the format string to the output
2543 string. (we cannot just skip the
2544 code, since there's no way to know
2545 what's in the argument list) */
2546 n += strlen(p);
2547 goto expand;
2548 }
2549 } else
2550 n++;
2551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002552 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 we don't have to resize the string.
2556 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002557 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 if (!string)
2559 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002560 kind = PyUnicode_KIND(string);
2561 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002567 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002568
2569 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2571 /* checking for == because the last argument could be a empty
2572 string, which causes i to point to end, the assert at the end of
2573 the loop */
2574 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002575
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 switch (*f) {
2577 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002578 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 const int ordinal = va_arg(vargs, int);
2580 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002582 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002583 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 case 'p':
2588 /* unused, since we already have the result */
2589 if (*f == 'p')
2590 (void) va_arg(vargs, void *);
2591 else
2592 (void) va_arg(vargs, int);
2593 /* extract the result from numberresults and append. */
2594 for (; *numberresult; ++i, ++numberresult)
2595 PyUnicode_WRITE(kind, data, i, *numberresult);
2596 /* skip over the separating '\0' */
2597 assert(*numberresult == '\0');
2598 numberresult++;
2599 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002600 break;
2601 case 's':
2602 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002603 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002605 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 size = PyUnicode_GET_LENGTH(*callresult);
2607 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002608 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 /* We're done with the unicode()/repr() => forget it */
2611 Py_DECREF(*callresult);
2612 /* switch to next unicode()/repr() result */
2613 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002614 break;
2615 }
2616 case 'U':
2617 {
2618 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 Py_ssize_t size;
2620 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2621 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002622 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 break;
2625 }
2626 case 'V':
2627 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002630 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 size = PyUnicode_GET_LENGTH(obj);
2633 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002634 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 size = PyUnicode_GET_LENGTH(*callresult);
2638 assert(PyUnicode_KIND(*callresult) <=
2639 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002640 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002642 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002644 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 break;
2646 }
2647 case 'S':
2648 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002649 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002651 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 /* unused, since we already have the result */
2653 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002655 copy_characters(string, i, *callresult, 0, size);
2656 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* We're done with the unicode()/repr() => forget it */
2658 Py_DECREF(*callresult);
2659 /* switch to next unicode()/repr() result */
2660 ++callresult;
2661 break;
2662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002663 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 break;
2666 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 for (; *p; ++p, ++i)
2668 PyUnicode_WRITE(kind, data, i, *p);
2669 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 goto end;
2671 }
Victor Stinner1205f272010-09-11 00:54:47 +00002672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 else {
2674 assert(i < PyUnicode_GET_LENGTH(string));
2675 PyUnicode_WRITE(kind, data, i++, *f);
2676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002679
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 if (callresults)
2682 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 if (numberresults)
2684 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002685 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 if (callresults) {
2688 PyObject **callresult2 = callresults;
2689 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002690 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002691 ++callresult2;
2692 }
2693 PyObject_Free(callresults);
2694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 if (numberresults)
2696 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002698}
2699
Walter Dörwaldd2034312007-05-18 16:29:38 +00002700PyObject *
2701PyUnicode_FromFormat(const char *format, ...)
2702{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 PyObject* ret;
2704 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705
2706#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 ret = PyUnicode_FromFormatV(format, vargs);
2712 va_end(vargs);
2713 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002714}
2715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716#ifdef HAVE_WCHAR_H
2717
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2719 convert a Unicode object to a wide character string.
2720
Victor Stinnerd88d9832011-09-06 02:00:05 +02002721 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002722 character) required to convert the unicode object. Ignore size argument.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002728unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002729 wchar_t *w,
2730 Py_ssize_t size)
2731{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 const wchar_t *wstr;
2734
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002735 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 if (wstr == NULL)
2737 return -1;
2738
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002740 if (size > res)
2741 size = res + 1;
2742 else
2743 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 return res;
2746 }
2747 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002749}
2750
2751Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002752PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002753 wchar_t *w,
2754 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755{
2756 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002757 PyErr_BadInternalCall();
2758 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002760 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761}
2762
Victor Stinner137c34c2010-09-29 10:25:54 +00002763wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002764PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002765 Py_ssize_t *size)
2766{
2767 wchar_t* buffer;
2768 Py_ssize_t buflen;
2769
2770 if (unicode == NULL) {
2771 PyErr_BadInternalCall();
2772 return NULL;
2773 }
2774
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002775 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 if (buflen == -1)
2777 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002778 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002779 PyErr_NoMemory();
2780 return NULL;
2781 }
2782
Victor Stinner137c34c2010-09-29 10:25:54 +00002783 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2784 if (buffer == NULL) {
2785 PyErr_NoMemory();
2786 return NULL;
2787 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002788 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 if (buflen == -1)
2790 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002791 if (size != NULL)
2792 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002793 return buffer;
2794}
2795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797
Alexander Belopolsky40018472011-02-26 01:02:56 +00002798PyObject *
2799PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002802 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002803 PyErr_SetString(PyExc_ValueError,
2804 "chr() arg not in range(0x110000)");
2805 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002806 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 if (ordinal < 256)
2809 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 v = PyUnicode_New(1, ordinal);
2812 if (v == NULL)
2813 return NULL;
2814 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002815 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002817}
2818
Alexander Belopolsky40018472011-02-26 01:02:56 +00002819PyObject *
2820PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002822 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002823 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002824 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002825 if (PyUnicode_READY(obj))
2826 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 Py_INCREF(obj);
2828 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 }
2830 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 /* For a Unicode subtype that's not a Unicode object,
2832 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002833 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002834 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002835 PyErr_Format(PyExc_TypeError,
2836 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002837 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002838 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002839}
2840
Alexander Belopolsky40018472011-02-26 01:02:56 +00002841PyObject *
2842PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002843 const char *encoding,
2844 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002845{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002846 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002847 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002848
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002850 PyErr_BadInternalCall();
2851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002853
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002854 /* Decoding bytes objects is the most common case and should be fast */
2855 if (PyBytes_Check(obj)) {
2856 if (PyBytes_GET_SIZE(obj) == 0) {
2857 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002858 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 }
2860 else {
2861 v = PyUnicode_Decode(
2862 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2863 encoding, errors);
2864 }
2865 return v;
2866 }
2867
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002868 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 PyErr_SetString(PyExc_TypeError,
2870 "decoding str is not supported");
2871 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002872 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002873
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2875 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2876 PyErr_Format(PyExc_TypeError,
2877 "coercing to str: need bytes, bytearray "
2878 "or buffer-like object, %.80s found",
2879 Py_TYPE(obj)->tp_name);
2880 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002881 }
Tim Petersced69f82003-09-16 20:30:58 +00002882
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002883 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002884 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002885 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 }
Tim Petersced69f82003-09-16 20:30:58 +00002887 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002888 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002889
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002891 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892}
2893
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The normalized name and its terminating NUL are written to lower, a
   buffer of lower_len bytes.

   Return 0 on error (the normalized name, terminator included, does not
   fit in lower_len bytes; the content of lower is then unspecified),
   1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* With no room at all not even the terminator can be stored, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating NUL of the literal. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last slot of the buffer: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2930
Alexander Belopolsky40018472011-02-26 01:02:56 +00002931PyObject *
2932PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002933 Py_ssize_t size,
2934 const char *encoding,
2935 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002936{
2937 PyObject *buffer = NULL, *unicode;
2938 Py_buffer info;
2939 char lower[11]; /* Enough for any encoding shortcut */
2940
Fred Drakee4315f52000-05-09 19:53:39 +00002941 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002942 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002943 if ((strcmp(lower, "utf-8") == 0) ||
2944 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002945 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002946 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002947 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002948 (strcmp(lower, "iso-8859-1") == 0))
2949 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002950#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002951 else if (strcmp(lower, "mbcs") == 0)
2952 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002953#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002954 else if (strcmp(lower, "ascii") == 0)
2955 return PyUnicode_DecodeASCII(s, size, errors);
2956 else if (strcmp(lower, "utf-16") == 0)
2957 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2958 else if (strcmp(lower, "utf-32") == 0)
2959 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961
2962 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002963 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002964 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002965 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002966 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 if (buffer == NULL)
2968 goto onError;
2969 unicode = PyCodec_Decode(buffer, encoding, errors);
2970 if (unicode == NULL)
2971 goto onError;
2972 if (!PyUnicode_Check(unicode)) {
2973 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002974 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002975 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 Py_DECREF(unicode);
2977 goto onError;
2978 }
2979 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002980 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002981
Benjamin Peterson29060642009-01-31 22:14:21 +00002982 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 Py_XDECREF(buffer);
2984 return NULL;
2985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002989 const char *encoding,
2990 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002991{
2992 PyObject *v;
2993
2994 if (!PyUnicode_Check(unicode)) {
2995 PyErr_BadArgument();
2996 goto onError;
2997 }
2998
2999 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003001
3002 /* Decode via the codec registry */
3003 v = PyCodec_Decode(unicode, encoding, errors);
3004 if (v == NULL)
3005 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003006 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003007
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009 return NULL;
3010}
3011
Alexander Belopolsky40018472011-02-26 01:02:56 +00003012PyObject *
3013PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003014 const char *encoding,
3015 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003016{
3017 PyObject *v;
3018
3019 if (!PyUnicode_Check(unicode)) {
3020 PyErr_BadArgument();
3021 goto onError;
3022 }
3023
3024 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003026
3027 /* Decode via the codec registry */
3028 v = PyCodec_Decode(unicode, encoding, errors);
3029 if (v == NULL)
3030 goto onError;
3031 if (!PyUnicode_Check(v)) {
3032 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003033 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003034 Py_TYPE(v)->tp_name);
3035 Py_DECREF(v);
3036 goto onError;
3037 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003038 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003039
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041 return NULL;
3042}
3043
Alexander Belopolsky40018472011-02-26 01:02:56 +00003044PyObject *
3045PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003046 Py_ssize_t size,
3047 const char *encoding,
3048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003049{
3050 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003051
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 unicode = PyUnicode_FromUnicode(s, size);
3053 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3056 Py_DECREF(unicode);
3057 return v;
3058}
3059
Alexander Belopolsky40018472011-02-26 01:02:56 +00003060PyObject *
3061PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003062 const char *encoding,
3063 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003064{
3065 PyObject *v;
3066
3067 if (!PyUnicode_Check(unicode)) {
3068 PyErr_BadArgument();
3069 goto onError;
3070 }
3071
3072 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003074
3075 /* Encode via the codec registry */
3076 v = PyCodec_Encode(unicode, encoding, errors);
3077 if (v == NULL)
3078 goto onError;
3079 return v;
3080
Benjamin Peterson29060642009-01-31 22:14:21 +00003081 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003082 return NULL;
3083}
3084
Victor Stinnerad158722010-10-27 00:25:46 +00003085PyObject *
3086PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003087{
Victor Stinner99b95382011-07-04 14:23:54 +02003088#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003089 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003090#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003091 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003092#else
Victor Stinner793b5312011-04-27 00:24:21 +02003093 PyInterpreterState *interp = PyThreadState_GET()->interp;
3094 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3095 cannot use it to encode and decode filenames before it is loaded. Load
3096 the Python codec requires to encode at least its own filename. Use the C
3097 version of the locale codec until the codec registry is initialized and
3098 the Python codec is loaded.
3099
3100 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3101 cannot only rely on it: check also interp->fscodec_initialized for
3102 subinterpreters. */
3103 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003104 return PyUnicode_AsEncodedString(unicode,
3105 Py_FileSystemDefaultEncoding,
3106 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003107 }
3108 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003109 /* locale encoding with surrogateescape */
3110 wchar_t *wchar;
3111 char *bytes;
3112 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003113 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003114
3115 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3116 if (wchar == NULL)
3117 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003118 bytes = _Py_wchar2char(wchar, &error_pos);
3119 if (bytes == NULL) {
3120 if (error_pos != (size_t)-1) {
3121 char *errmsg = strerror(errno);
3122 PyObject *exc = NULL;
3123 if (errmsg == NULL)
3124 errmsg = "Py_wchar2char() failed";
3125 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003126 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003127 error_pos, error_pos+1,
3128 errmsg);
3129 Py_XDECREF(exc);
3130 }
3131 else
3132 PyErr_NoMemory();
3133 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003134 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003135 }
3136 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003137
3138 bytes_obj = PyBytes_FromString(bytes);
3139 PyMem_Free(bytes);
3140 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003141 }
Victor Stinnerad158722010-10-27 00:25:46 +00003142#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003143}
3144
Alexander Belopolsky40018472011-02-26 01:02:56 +00003145PyObject *
3146PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003147 const char *encoding,
3148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149{
3150 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003151 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003152
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 if (!PyUnicode_Check(unicode)) {
3154 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 }
Fred Drakee4315f52000-05-09 19:53:39 +00003157
Fred Drakee4315f52000-05-09 19:53:39 +00003158 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003159 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003160 if ((strcmp(lower, "utf-8") == 0) ||
3161 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003162 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003163 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003164 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003165 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003166 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003167 }
Victor Stinner37296e82010-06-10 13:36:23 +00003168 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003169 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003170 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003171 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003172#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003173 else if (strcmp(lower, "mbcs") == 0)
3174 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003175#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003176 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003177 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179
3180 /* Encode via the codec registry */
3181 v = PyCodec_Encode(unicode, encoding, errors);
3182 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003183 return NULL;
3184
3185 /* The normal path */
3186 if (PyBytes_Check(v))
3187 return v;
3188
3189 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003191 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003192 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003193
3194 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3195 "encoder %s returned bytearray instead of bytes",
3196 encoding);
3197 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003198 Py_DECREF(v);
3199 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003200 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003201
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003202 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3203 Py_DECREF(v);
3204 return b;
3205 }
3206
3207 PyErr_Format(PyExc_TypeError,
3208 "encoder did not return a bytes object (type=%.400s)",
3209 Py_TYPE(v)->tp_name);
3210 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003211 return NULL;
3212}
3213
Alexander Belopolsky40018472011-02-26 01:02:56 +00003214PyObject *
3215PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003216 const char *encoding,
3217 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218{
3219 PyObject *v;
3220
3221 if (!PyUnicode_Check(unicode)) {
3222 PyErr_BadArgument();
3223 goto onError;
3224 }
3225
3226 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003228
3229 /* Encode via the codec registry */
3230 v = PyCodec_Encode(unicode, encoding, errors);
3231 if (v == NULL)
3232 goto onError;
3233 if (!PyUnicode_Check(v)) {
3234 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003235 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003236 Py_TYPE(v)->tp_name);
3237 Py_DECREF(v);
3238 goto onError;
3239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003241
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 return NULL;
3244}
3245
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003246PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003247PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003249 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3250}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003251
Christian Heimes5894ba72007-11-04 11:43:14 +00003252PyObject*
3253PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3254{
Victor Stinner99b95382011-07-04 14:23:54 +02003255#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003256 return PyUnicode_DecodeMBCS(s, size, NULL);
3257#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003258 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003259#else
Victor Stinner793b5312011-04-27 00:24:21 +02003260 PyInterpreterState *interp = PyThreadState_GET()->interp;
3261 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3262 cannot use it to encode and decode filenames before it is loaded. Load
3263 the Python codec requires to encode at least its own filename. Use the C
3264 version of the locale codec until the codec registry is initialized and
3265 the Python codec is loaded.
3266
3267 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3268 cannot only rely on it: check also interp->fscodec_initialized for
3269 subinterpreters. */
3270 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003271 return PyUnicode_Decode(s, size,
3272 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003273 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003274 }
3275 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003276 /* locale encoding with surrogateescape */
3277 wchar_t *wchar;
3278 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003279 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003280
3281 if (s[size] != '\0' || size != strlen(s)) {
3282 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3283 return NULL;
3284 }
3285
Victor Stinner168e1172010-10-16 23:16:16 +00003286 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003288 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003289
Victor Stinner168e1172010-10-16 23:16:16 +00003290 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003291 PyMem_Free(wchar);
3292 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003293 }
Victor Stinnerad158722010-10-27 00:25:46 +00003294#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003295}
3296
Martin v. Löwis011e8422009-05-05 04:43:17 +00003297
3298int
3299PyUnicode_FSConverter(PyObject* arg, void* addr)
3300{
3301 PyObject *output = NULL;
3302 Py_ssize_t size;
3303 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003304 if (arg == NULL) {
3305 Py_DECREF(*(PyObject**)addr);
3306 return 1;
3307 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003308 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003309 output = arg;
3310 Py_INCREF(output);
3311 }
3312 else {
3313 arg = PyUnicode_FromObject(arg);
3314 if (!arg)
3315 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003316 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003317 Py_DECREF(arg);
3318 if (!output)
3319 return 0;
3320 if (!PyBytes_Check(output)) {
3321 Py_DECREF(output);
3322 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3323 return 0;
3324 }
3325 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003326 size = PyBytes_GET_SIZE(output);
3327 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003328 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003329 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003330 Py_DECREF(output);
3331 return 0;
3332 }
3333 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003334 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003335}
3336
3337
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003338int
3339PyUnicode_FSDecoder(PyObject* arg, void* addr)
3340{
3341 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003342 if (arg == NULL) {
3343 Py_DECREF(*(PyObject**)addr);
3344 return 1;
3345 }
3346 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 if (PyUnicode_READY(arg))
3348 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003349 output = arg;
3350 Py_INCREF(output);
3351 }
3352 else {
3353 arg = PyBytes_FromObject(arg);
3354 if (!arg)
3355 return 0;
3356 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3357 PyBytes_GET_SIZE(arg));
3358 Py_DECREF(arg);
3359 if (!output)
3360 return 0;
3361 if (!PyUnicode_Check(output)) {
3362 Py_DECREF(output);
3363 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3364 return 0;
3365 }
3366 }
Victor Stinner065836e2011-10-27 01:56:33 +02003367 if (PyUnicode_READY(output) < 0) {
3368 Py_DECREF(output);
3369 return 0;
3370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003371 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003372 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003373 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3374 Py_DECREF(output);
3375 return 0;
3376 }
3377 *(PyObject**)addr = output;
3378 return Py_CLEANUP_SUPPORTED;
3379}
3380
3381
Martin v. Löwis5b222132007-06-10 09:51:05 +00003382char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003383PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003384{
Christian Heimesf3863112007-11-22 07:46:41 +00003385 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003387 if (!PyUnicode_Check(unicode)) {
3388 PyErr_BadArgument();
3389 return NULL;
3390 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003391 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003392 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003394 if (PyUnicode_UTF8(unicode) == NULL) {
3395 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003396 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3397 if (bytes == NULL)
3398 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003399 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3400 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 Py_DECREF(bytes);
3402 return NULL;
3403 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003404 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3405 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3406 PyBytes_AS_STRING(bytes),
3407 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 Py_DECREF(bytes);
3409 }
3410
3411 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003412 *psize = PyUnicode_UTF8_LENGTH(unicode);
3413 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003414}
3415
3416char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003419 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3420}
3421
3422#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003423static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003424#endif
3425
3426
3427Py_UNICODE *
3428PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003430 const unsigned char *one_byte;
3431#if SIZEOF_WCHAR_T == 4
3432 const Py_UCS2 *two_bytes;
3433#else
3434 const Py_UCS4 *four_bytes;
3435 const Py_UCS4 *ucs4_end;
3436 Py_ssize_t num_surrogates;
3437#endif
3438 wchar_t *w;
3439 wchar_t *wchar_end;
3440
3441 if (!PyUnicode_Check(unicode)) {
3442 PyErr_BadArgument();
3443 return NULL;
3444 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003445 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003446 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003447 assert(_PyUnicode_KIND(unicode) != 0);
3448 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003449
3450#ifdef Py_DEBUG
3451 ++unicode_as_unicode_calls;
3452#endif
3453
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003454 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003455#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3457 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 num_surrogates = 0;
3459
3460 for (; four_bytes < ucs4_end; ++four_bytes) {
3461 if (*four_bytes > 0xFFFF)
3462 ++num_surrogates;
3463 }
3464
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003465 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3466 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3467 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003468 PyErr_NoMemory();
3469 return NULL;
3470 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003471 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003473 w = _PyUnicode_WSTR(unicode);
3474 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3475 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3477 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003478 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003480 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3481 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 }
3483 else
3484 *w = *four_bytes;
3485
3486 if (w > wchar_end) {
3487 assert(0 && "Miscalculated string end");
3488 }
3489 }
3490 *w = 0;
3491#else
3492 /* sizeof(wchar_t) == 4 */
3493 Py_FatalError("Impossible unicode object state, wstr and str "
3494 "should share memory already.");
3495 return NULL;
3496#endif
3497 }
3498 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003499 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3500 (_PyUnicode_LENGTH(unicode) + 1));
3501 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003502 PyErr_NoMemory();
3503 return NULL;
3504 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003505 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3506 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3507 w = _PyUnicode_WSTR(unicode);
3508 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003509
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003510 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3511 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512 for (; w < wchar_end; ++one_byte, ++w)
3513 *w = *one_byte;
3514 /* null-terminate the wstr */
3515 *w = 0;
3516 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003517 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003519 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520 for (; w < wchar_end; ++two_bytes, ++w)
3521 *w = *two_bytes;
3522 /* null-terminate the wstr */
3523 *w = 0;
3524#else
3525 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003526 PyObject_FREE(_PyUnicode_WSTR(unicode));
3527 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003528 Py_FatalError("Impossible unicode object state, wstr "
3529 "and str should share memory already.");
3530 return NULL;
3531#endif
3532 }
3533 else {
3534 assert(0 && "This should never happen.");
3535 }
3536 }
3537 }
3538 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003539 *size = PyUnicode_WSTR_LENGTH(unicode);
3540 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003541}
3542
Alexander Belopolsky40018472011-02-26 01:02:56 +00003543Py_UNICODE *
3544PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003546 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547}
3548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003549
Alexander Belopolsky40018472011-02-26 01:02:56 +00003550Py_ssize_t
3551PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552{
3553 if (!PyUnicode_Check(unicode)) {
3554 PyErr_BadArgument();
3555 goto onError;
3556 }
3557 return PyUnicode_GET_SIZE(unicode);
3558
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003560 return -1;
3561}
3562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563Py_ssize_t
3564PyUnicode_GetLength(PyObject *unicode)
3565{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003566 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003567 PyErr_BadArgument();
3568 return -1;
3569 }
3570
3571 return PyUnicode_GET_LENGTH(unicode);
3572}
3573
3574Py_UCS4
3575PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3576{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003577 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3578 PyErr_BadArgument();
3579 return (Py_UCS4)-1;
3580 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003581 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003582 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003583 return (Py_UCS4)-1;
3584 }
3585 return PyUnicode_READ_CHAR(unicode, index);
3586}
3587
3588int
3589PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3590{
3591 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003592 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003593 return -1;
3594 }
Victor Stinner488fa492011-12-12 00:01:39 +01003595 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003596 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003597 PyErr_SetString(PyExc_IndexError, "string index out of range");
3598 return -1;
3599 }
Victor Stinner488fa492011-12-12 00:01:39 +01003600 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003601 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003602 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3603 index, ch);
3604 return 0;
3605}
3606
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3612
Victor Stinner554f3f02010-06-16 23:33:54 +00003613/* create or adjust a UnicodeDecodeError */
3614static void
3615make_decode_exception(PyObject **exceptionObject,
3616 const char *encoding,
3617 const char *input, Py_ssize_t length,
3618 Py_ssize_t startpos, Py_ssize_t endpos,
3619 const char *reason)
3620{
3621 if (*exceptionObject == NULL) {
3622 *exceptionObject = PyUnicodeDecodeError_Create(
3623 encoding, input, length, startpos, endpos, reason);
3624 }
3625 else {
3626 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3627 goto onError;
3628 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3629 goto onError;
3630 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3631 goto onError;
3632 }
3633 return;
3634
3635onError:
3636 Py_DECREF(*exceptionObject);
3637 *exceptionObject = NULL;
3638}
3639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640/* error handling callback helper:
3641 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003642 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 and adjust various state variables.
3644 return 0 on success, -1 on error
3645*/
3646
Alexander Belopolsky40018472011-02-26 01:02:56 +00003647static int
3648unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003649 const char *encoding, const char *reason,
3650 const char **input, const char **inend, Py_ssize_t *startinpos,
3651 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003652 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003654 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655
3656 PyObject *restuple = NULL;
3657 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003658 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003659 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003660 Py_ssize_t requiredsize;
3661 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003662 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 int res = -1;
3664
Victor Stinner596a6c42011-11-09 00:02:18 +01003665 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3666 outsize = PyUnicode_GET_LENGTH(*output);
3667 else
3668 outsize = _PyUnicode_WSTR_LENGTH(*output);
3669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 *errorHandler = PyCodec_LookupError(errors);
3672 if (*errorHandler == NULL)
3673 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 }
3675
Victor Stinner554f3f02010-06-16 23:33:54 +00003676 make_decode_exception(exceptionObject,
3677 encoding,
3678 *input, *inend - *input,
3679 *startinpos, *endinpos,
3680 reason);
3681 if (*exceptionObject == NULL)
3682 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683
3684 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003688 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 }
3691 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003692 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003693 if (PyUnicode_READY(repunicode) < 0)
3694 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003695
3696 /* Copy back the bytes variables, which might have been modified by the
3697 callback */
3698 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3699 if (!inputobj)
3700 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003701 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003703 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003704 *input = PyBytes_AS_STRING(inputobj);
3705 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003706 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003707 /* we can DECREF safely, as the exception has another reference,
3708 so the object won't go away. */
3709 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003710
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003711 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003713 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3715 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003717
Victor Stinner596a6c42011-11-09 00:02:18 +01003718 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3719 /* need more space? (at least enough for what we
3720 have+the replacement+the rest of the string (starting
3721 at the new input position), so we won't have to check space
3722 when there are no errors in the rest of the string) */
3723 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3724 requiredsize = *outpos + replen + insize-newpos;
3725 if (requiredsize > outsize) {
3726 if (requiredsize<2*outsize)
3727 requiredsize = 2*outsize;
3728 if (unicode_resize(output, requiredsize) < 0)
3729 goto onError;
3730 }
3731 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003733 copy_characters(*output, *outpos, repunicode, 0, replen);
3734 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003736 else {
3737 wchar_t *repwstr;
3738 Py_ssize_t repwlen;
3739 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3740 if (repwstr == NULL)
3741 goto onError;
3742 /* need more space? (at least enough for what we
3743 have+the replacement+the rest of the string (starting
3744 at the new input position), so we won't have to check space
3745 when there are no errors in the rest of the string) */
3746 requiredsize = *outpos + repwlen + insize-newpos;
3747 if (requiredsize > outsize) {
3748 if (requiredsize < 2*outsize)
3749 requiredsize = 2*outsize;
3750 if (unicode_resize(output, requiredsize) < 0)
3751 goto onError;
3752 }
3753 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3754 *outpos += repwlen;
3755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003757 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003758
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 /* we made it! */
3760 res = 0;
3761
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 Py_XDECREF(restuple);
3764 return res;
3765}
3766
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003767/* --- UTF-7 Codec -------------------------------------------------------- */
3768
Antoine Pitrou244651a2009-05-04 18:56:13 +00003769/* See RFC2152 for details. We encode conservatively and decode liberally. */
3770
/* Three small helper macros describing the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/').  Note that each macro may evaluate its
   argument more than once. */

/* True if c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Map a base-64 character c to its 6-bit value (0..63).  The result is
   only meaningful if IS_BASE64(c) holds: anything that is not a letter,
   a digit or '+' maps to 63. */

#define FROM_BASE64(c)                              \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :       \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :  \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, found in a UTF-7 string outside a shift
 * sequence, decodes as itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) < 128 && (c) != '+')
3801
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by the ASCII
 * code (0..127), identifies these different sets:
 *     0 : "Set D"
 *         alphanumeric and '(),-./:?
 *     1 : "Set O"
 *         !"#$%&*;<=>@[]^_`{|}
 *     2 : "whitespace"
 *         ht nl cr sp
 *     3 : special (must be base64 encoded)
 *         everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 * The range test comes first so that utf7_category is never indexed with
 * a value outside 1..127; NUL and non-ASCII are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003847
Alexander Belopolsky40018472011-02-26 01:02:56 +00003848PyObject *
3849PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003850 Py_ssize_t size,
3851 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003853 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3854}
3855
Antoine Pitrou244651a2009-05-04 18:56:13 +00003856/* The decoder. The only state we preserve is our read position,
3857 * i.e. how many characters we have consumed. So if we end in the
3858 * middle of a shift sequence we have to back off the read position
3859 * and the output to the beginning of the sequence, otherwise we lose
3860 * all the shift state (seen bits, number of bits seen, high
3861 * surrogate). */
3862
Alexander Belopolsky40018472011-02-26 01:02:56 +00003863PyObject *
3864PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003865 Py_ssize_t size,
3866 const char *errors,
3867 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003869 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003870 Py_ssize_t startinpos;
3871 Py_ssize_t endinpos;
3872 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003873 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003874 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 const char *errmsg = "";
3876 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003877 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003878 unsigned int base64bits = 0;
3879 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003880 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 PyObject *errorHandler = NULL;
3882 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003883
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003884 /* Start off assuming it's all ASCII. Widen later as necessary. */
3885 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003886 if (!unicode)
3887 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003888 if (size == 0) {
3889 if (consumed)
3890 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003891 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003892 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003894 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895 e = s + size;
3896
3897 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003898 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003899 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003900 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003901
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902 if (inShift) { /* in a base-64 section */
3903 if (IS_BASE64(ch)) { /* consume a base-64 character */
3904 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3905 base64bits += 6;
3906 s++;
3907 if (base64bits >= 16) {
3908 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003909 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003910 base64bits -= 16;
3911 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3912 if (surrogate) {
3913 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003914 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3915 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003916 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3917 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003918 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003919 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 }
3921 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003922 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3923 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003925 }
3926 }
Victor Stinner551ac952011-11-29 22:58:13 +01003927 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003928 /* first surrogate */
3929 surrogate = outCh;
3930 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003932 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3933 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003934 }
3935 }
3936 }
3937 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003938 inShift = 0;
3939 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003940 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003941 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3942 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003943 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945 if (base64bits > 0) { /* left-over bits */
3946 if (base64bits >= 6) {
3947 /* We've seen at least one base-64 character */
3948 errmsg = "partial character in shift sequence";
3949 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 else {
3952 /* Some bits remain; they should be zero */
3953 if (base64buffer != 0) {
3954 errmsg = "non-zero padding bits in shift sequence";
3955 goto utf7Error;
3956 }
3957 }
3958 }
3959 if (ch != '-') {
3960 /* '-' is absorbed; other terminating
3961 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003962 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3963 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003964 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003965 }
3966 }
3967 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003969 s++; /* consume '+' */
3970 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003971 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003972 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3973 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003974 }
3975 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003976 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003977 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003978 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003979 }
3980 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003981 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003982 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3983 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003984 s++;
3985 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003986 else {
3987 startinpos = s-starts;
3988 s++;
3989 errmsg = "unexpected special character";
3990 goto utf7Error;
3991 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003993utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003994 endinpos = s-starts;
3995 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 errors, &errorHandler,
3997 "utf7", errmsg,
3998 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003999 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004001 }
4002
Antoine Pitrou244651a2009-05-04 18:56:13 +00004003 /* end of string */
4004
4005 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4006 /* if we're in an inconsistent state, that's an error */
4007 if (surrogate ||
4008 (base64bits >= 6) ||
4009 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004010 endinpos = size;
4011 if (unicode_decode_call_errorhandler(
4012 errors, &errorHandler,
4013 "utf7", "unterminated shift sequence",
4014 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004015 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004016 goto onError;
4017 if (s < e)
4018 goto restart;
4019 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004020 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004021
4022 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004023 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004025 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004026 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004027 }
4028 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004029 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004031 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004032
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004033 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034 goto onError;
4035
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 Py_XDECREF(errorHandler);
4037 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004038 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004039
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004041 Py_XDECREF(errorHandler);
4042 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004043 Py_DECREF(unicode);
4044 return NULL;
4045}
4046
4047
Alexander Belopolsky40018472011-02-26 01:02:56 +00004048PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004049_PyUnicode_EncodeUTF7(PyObject *str,
4050 int base64SetO,
4051 int base64WhiteSpace,
4052 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004053{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004054 int kind;
4055 void *data;
4056 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004057 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004058 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004060 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004061 unsigned int base64bits = 0;
4062 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004063 char * out;
4064 char * start;
4065
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004066 if (PyUnicode_READY(str) < 0)
4067 return NULL;
4068 kind = PyUnicode_KIND(str);
4069 data = PyUnicode_DATA(str);
4070 len = PyUnicode_GET_LENGTH(str);
4071
4072 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004074
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004075 /* It might be possible to tighten this worst case */
4076 allocated = 8 * len;
4077 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004078 return PyErr_NoMemory();
4079
Antoine Pitrou244651a2009-05-04 18:56:13 +00004080 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004081 if (v == NULL)
4082 return NULL;
4083
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004084 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004085 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004086 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004087
Antoine Pitrou244651a2009-05-04 18:56:13 +00004088 if (inShift) {
4089 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4090 /* shifting out */
4091 if (base64bits) { /* output remaining bits */
4092 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4093 base64buffer = 0;
4094 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004095 }
4096 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004097 /* Characters not in the BASE64 set implicitly unshift the sequence
4098 so no '-' is required, except if the character is itself a '-' */
4099 if (IS_BASE64(ch) || ch == '-') {
4100 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004101 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004102 *out++ = (char) ch;
4103 }
4104 else {
4105 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004106 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004107 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004108 else { /* not in a shift sequence */
4109 if (ch == '+') {
4110 *out++ = '+';
4111 *out++ = '-';
4112 }
4113 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4114 *out++ = (char) ch;
4115 }
4116 else {
4117 *out++ = '+';
4118 inShift = 1;
4119 goto encode_char;
4120 }
4121 }
4122 continue;
4123encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004124 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004125 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004126
Antoine Pitrou244651a2009-05-04 18:56:13 +00004127 /* code first surrogate */
4128 base64bits += 16;
4129 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4130 while (base64bits >= 6) {
4131 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4132 base64bits -= 6;
4133 }
4134 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004135 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004136 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004137 base64bits += 16;
4138 base64buffer = (base64buffer << 16) | ch;
4139 while (base64bits >= 6) {
4140 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4141 base64bits -= 6;
4142 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004143 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004144 if (base64bits)
4145 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4146 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004148 if (_PyBytes_Resize(&v, out - start) < 0)
4149 return NULL;
4150 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004151}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004152PyObject *
4153PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4154 Py_ssize_t size,
4155 int base64SetO,
4156 int base64WhiteSpace,
4157 const char *errors)
4158{
4159 PyObject *result;
4160 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4161 if (tmp == NULL)
4162 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004163 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004164 base64WhiteSpace, errors);
4165 Py_DECREF(tmp);
4166 return result;
4167}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004168
/* Remove the UTF-7 helper macros so that they are not visible to the code
   that follows. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004174
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175/* --- UTF-8 Codec -------------------------------------------------------- */
4176
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       The table is read-only, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4198
Alexander Belopolsky40018472011-02-26 01:02:56 +00004199PyObject *
4200PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004201 Py_ssize_t size,
4202 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203{
Walter Dörwald69652032004-09-07 20:24:22 +00004204 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4205}
4206
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004207#include "stringlib/ucs1lib.h"
4208#include "stringlib/codecs.h"
4209#include "stringlib/undef.h"
4210
4211#include "stringlib/ucs2lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
4215#include "stringlib/ucs4lib.h"
4216#include "stringlib/codecs.h"
4217#include "stringlib/undef.h"
4218
Antoine Pitrouab868312009-01-10 15:40:25 +00004219/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4220#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4221
4222/* Mask to quickly check whether a C 'long' contains a
4223 non-ASCII, UTF8-encoded char. */
4224#if (SIZEOF_LONG == 8)
4225# define ASCII_CHAR_MASK 0x8080808080808080L
4226#elif (SIZEOF_LONG == 4)
4227# define ASCII_CHAR_MASK 0x80808080L
4228#else
4229# error C 'long' size should be either 4 or 8!
4230#endif
4231
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004232/* Scans a UTF-8 string and returns the maximum character to be expected
4233 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004234
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004235 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237 */
4238static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004239utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004241 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242 const unsigned char *end = p + string_size;
4243 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004245 assert(unicode_size != NULL);
4246
4247 /* By having a cascade of independent loops which fallback onto each
4248 other, we minimize the amount of work done in the average loop
4249 iteration, and we also maximize the CPU's ability to predict
4250 branches correctly (because a given condition will have always the
4251 same boolean outcome except perhaps in the last iteration of the
4252 corresponding loop).
4253 In the general case this brings us rather close to decoding
4254 performance pre-PEP 393, despite the two-pass decoding.
4255
4256 Note that the pure ASCII loop is not duplicated once a non-ASCII
4257 character has been encountered. It is actually a pessimization (by
4258 a significant factor) to use this loop on text with many non-ASCII
4259 characters, and it is important to avoid bad performance on valid
4260 utf-8 data (invalid utf-8 being a different can of worms).
4261 */
4262
4263 /* ASCII */
4264 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004265 /* Only check value if it's not a ASCII char... */
4266 if (*p < 0x80) {
4267 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4268 an explanation. */
4269 if (!((size_t) p & LONG_PTR_MASK)) {
4270 /* Help register allocation */
4271 register const unsigned char *_p = p;
4272 while (_p < aligned_end) {
4273 unsigned long value = *(unsigned long *) _p;
4274 if (value & ASCII_CHAR_MASK)
4275 break;
4276 _p += SIZEOF_LONG;
4277 char_count += SIZEOF_LONG;
4278 }
4279 p = _p;
4280 if (p == end)
4281 break;
4282 }
4283 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004284 if (*p < 0x80)
4285 ++char_count;
4286 else
4287 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004289 *unicode_size = char_count;
4290 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004292_ucs1loop:
4293 for (; p < end; ++p) {
4294 if (*p < 0xc4)
4295 char_count += ((*p & 0xc0) != 0x80);
4296 else
4297 goto _ucs2loop;
4298 }
4299 *unicode_size = char_count;
4300 return 255;
4301
4302_ucs2loop:
4303 for (; p < end; ++p) {
4304 if (*p < 0xf0)
4305 char_count += ((*p & 0xc0) != 0x80);
4306 else
4307 goto _ucs4loop;
4308 }
4309 *unicode_size = char_count;
4310 return 65535;
4311
4312_ucs4loop:
4313 for (; p < end; ++p) {
4314 char_count += ((*p & 0xc0) != 0x80);
4315 }
4316 *unicode_size = char_count;
4317 return 65537;
4318}
4319
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors.

   index -- position to write to (evaluated exactly once)
   value -- character to store (evaluated exactly once)

   Implicit parameters taken from the expansion site:
     unicode -- the output object; its address is handed to unicode_resize()
                and unicode_putchar(), so the variable may be replaced
     onError -- label jumped to when either helper returns a negative value

   Potential resizing overallocates (by one eighth of the position), so the
   result needs to shrink at the end.

   The scratch variable uses a macro-specific name so that it cannot shadow
   an identifier appearing in the `index` or `value` argument.

   NOTE(review): growth is requested only when the position is strictly
   greater than the current length; whether a write at position == length is
   acceptable depends on unicode_putchar(), which is defined elsewhere --
   confirm. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t wmf_pos_ = (index);                                  \
        if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&                 \
            unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)        \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)          \
            goto onError;                                               \
    } while (0)

4333
Alexander Belopolsky40018472011-02-26 01:02:56 +00004334PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004335decode_utf8_errors(const char *starts,
4336 Py_ssize_t size,
4337 const char *errors,
4338 Py_ssize_t *consumed,
4339 const char *s,
4340 PyObject *unicode,
4341 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004342{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004344 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004345 Py_ssize_t startinpos;
4346 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004347 const char *e = starts + size;
4348 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004349 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 PyObject *errorHandler = NULL;
4351 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004352
Antoine Pitrouab868312009-01-10 15:40:25 +00004353 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
4355 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004356 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357
4358 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004359 /* Fast path for runs of ASCII characters. Given that common UTF-8
4360 input will consist of an overwhelming majority of ASCII
4361 characters, we try to optimize for this case by checking
4362 as many characters as a C 'long' can contain.
4363 First, check if we can do an aligned read, as most CPUs have
4364 a penalty for unaligned reads.
4365 */
4366 if (!((size_t) s & LONG_PTR_MASK)) {
4367 /* Help register allocation */
4368 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004370 while (_s < aligned_end) {
4371 /* Read a whole long at a time (either 4 or 8 bytes),
4372 and do a fast unrolled copy if it only contains ASCII
4373 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004374 unsigned long value = *(unsigned long *) _s;
4375 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004376 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004377 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4378 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4379 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4380 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004381#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004382 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4383 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4384 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4385 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004386#endif
4387 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004388 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004389 }
4390 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004392 if (s == e)
4393 break;
4394 ch = (unsigned char)*s;
4395 }
4396 }
4397
4398 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 s++;
4401 continue;
4402 }
4403
4404 n = utf8_code_length[ch];
4405
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004406 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 if (consumed)
4408 break;
4409 else {
4410 errmsg = "unexpected end of data";
4411 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004412 endinpos = startinpos+1;
4413 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4414 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 goto utf8Error;
4416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418
4419 switch (n) {
4420
4421 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004422 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 startinpos = s-starts;
4424 endinpos = startinpos+1;
4425 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426
4427 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004428 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 startinpos = s-starts;
4430 endinpos = startinpos+1;
4431 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432
4433 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004434 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004435 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004437 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 goto utf8Error;
4439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004442 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 break;
4444
4445 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004446 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4447 will result in surrogates in range d800-dfff. Surrogates are
4448 not valid UTF-8 so they are rejected.
4449 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4450 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004451 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004452 (s[2] & 0xc0) != 0x80 ||
4453 ((unsigned char)s[0] == 0xE0 &&
4454 (unsigned char)s[1] < 0xA0) ||
4455 ((unsigned char)s[0] == 0xED &&
4456 (unsigned char)s[1] > 0x9F)) {
4457 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004459 endinpos = startinpos + 1;
4460
4461 /* if s[1] first two bits are 1 and 0, then the invalid
4462 continuation byte is s[2], so increment endinpos by 1,
4463 if not, s[1] is invalid and endinpos doesn't need to
4464 be incremented. */
4465 if ((s[1] & 0xC0) == 0x80)
4466 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 goto utf8Error;
4468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004470 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004472 break;
4473
4474 case 4:
4475 if ((s[1] & 0xc0) != 0x80 ||
4476 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004477 (s[3] & 0xc0) != 0x80 ||
4478 ((unsigned char)s[0] == 0xF0 &&
4479 (unsigned char)s[1] < 0x90) ||
4480 ((unsigned char)s[0] == 0xF4 &&
4481 (unsigned char)s[1] > 0x8F)) {
4482 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004484 endinpos = startinpos + 1;
4485 if ((s[1] & 0xC0) == 0x80) {
4486 endinpos++;
4487 if ((s[2] & 0xC0) == 0x80)
4488 endinpos++;
4489 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 goto utf8Error;
4491 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004492 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004493 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004494 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004495
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004496 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 }
4499 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004501
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004503 if (unicode_decode_call_errorhandler(
4504 errors, &errorHandler,
4505 "utf8", errmsg,
4506 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004507 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509 /* Update data because unicode_decode_call_errorhandler might have
4510 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512 }
Walter Dörwald69652032004-09-07 20:24:22 +00004513 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004516 /* Adjust length and ready string when it contained errors and
4517 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004518 if (unicode_resize(&unicode, i) < 0)
4519 goto onError;
4520 unicode_adjust_maxchar(&unicode);
4521 if (unicode == NULL)
4522 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 Py_XDECREF(errorHandler);
4525 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004526 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004527 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_XDECREF(errorHandler);
4531 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004532 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 return NULL;
4534}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004535#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004536
Victor Stinner785938e2011-12-11 20:09:03 +01004537PyObject *
4538PyUnicode_DecodeUTF8Stateful(const char *s,
4539 Py_ssize_t size,
4540 const char *errors,
4541 Py_ssize_t *consumed)
4542{
4543 Py_UCS4 maxchar = 0;
4544 Py_ssize_t unicode_size;
4545 int has_errors = 0;
4546 PyObject *unicode;
4547 int kind;
4548 void *data;
4549 const char *starts = s;
4550 const char *e;
4551 Py_ssize_t i;
4552
4553 if (size == 0) {
4554 if (consumed)
4555 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004556 Py_INCREF(unicode_empty);
4557 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004558 }
4559
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004560 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004561
4562 /* When the string is ASCII only, just use memcpy and return.
4563 unicode_size may be != size if there is an incomplete UTF-8
4564 sequence at the end of the ASCII block. */
4565 if (maxchar < 128 && size == unicode_size) {
4566 if (consumed)
4567 *consumed = size;
4568 return unicode_fromascii(s, size);
4569 }
4570
4571 unicode = PyUnicode_New(unicode_size, maxchar);
4572 if (!unicode)
4573 return NULL;
4574 kind = PyUnicode_KIND(unicode);
4575 data = PyUnicode_DATA(unicode);
4576
4577 /* Unpack UTF-8 encoded data */
4578 i = 0;
4579 e = starts + size;
4580 switch (kind) {
4581 case PyUnicode_1BYTE_KIND:
4582 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4583 break;
4584 case PyUnicode_2BYTE_KIND:
4585 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4586 break;
4587 case PyUnicode_4BYTE_KIND:
4588 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4589 break;
4590 }
4591 if (!has_errors) {
4592 /* Ensure the unicode size calculation was correct */
4593 assert(i == unicode_size);
4594 assert(s == e);
4595 if (consumed)
4596 *consumed = size;
4597 return unicode;
4598 }
4599
4600 /* In case of errors, maxchar and size computation might be incorrect;
4601 code below refits and resizes as necessary. */
4602 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4603}
4604
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004605#ifdef __APPLE__
4606
4607/* Simplified UTF-8 decoder using surrogateescape error handler,
4608 used to decode the command line arguments on Mac OS X. */
4609
4610wchar_t*
4611_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4612{
4613 int n;
4614 const char *e;
4615 wchar_t *unicode, *p;
4616
4617 /* Note: size will always be longer than the resulting Unicode
4618 character count */
4619 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4620 PyErr_NoMemory();
4621 return NULL;
4622 }
4623 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4624 if (!unicode)
4625 return NULL;
4626
4627 /* Unpack UTF-8 encoded data */
4628 p = unicode;
4629 e = s + size;
4630 while (s < e) {
4631 Py_UCS4 ch = (unsigned char)*s;
4632
4633 if (ch < 0x80) {
4634 *p++ = (wchar_t)ch;
4635 s++;
4636 continue;
4637 }
4638
4639 n = utf8_code_length[ch];
4640 if (s + n > e) {
4641 goto surrogateescape;
4642 }
4643
4644 switch (n) {
4645 case 0:
4646 case 1:
4647 goto surrogateescape;
4648
4649 case 2:
4650 if ((s[1] & 0xc0) != 0x80)
4651 goto surrogateescape;
4652 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4653 assert ((ch > 0x007F) && (ch <= 0x07FF));
4654 *p++ = (wchar_t)ch;
4655 break;
4656
4657 case 3:
4658 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4659 will result in surrogates in range d800-dfff. Surrogates are
4660 not valid UTF-8 so they are rejected.
4661 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4662 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4663 if ((s[1] & 0xc0) != 0x80 ||
4664 (s[2] & 0xc0) != 0x80 ||
4665 ((unsigned char)s[0] == 0xE0 &&
4666 (unsigned char)s[1] < 0xA0) ||
4667 ((unsigned char)s[0] == 0xED &&
4668 (unsigned char)s[1] > 0x9F)) {
4669
4670 goto surrogateescape;
4671 }
4672 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4673 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004675 break;
4676
4677 case 4:
4678 if ((s[1] & 0xc0) != 0x80 ||
4679 (s[2] & 0xc0) != 0x80 ||
4680 (s[3] & 0xc0) != 0x80 ||
4681 ((unsigned char)s[0] == 0xF0 &&
4682 (unsigned char)s[1] < 0x90) ||
4683 ((unsigned char)s[0] == 0xF4 &&
4684 (unsigned char)s[1] > 0x8F)) {
4685 goto surrogateescape;
4686 }
4687 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4688 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004689 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004690
4691#if SIZEOF_WCHAR_T == 4
4692 *p++ = (wchar_t)ch;
4693#else
4694 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004695 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4696 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004697#endif
4698 break;
4699 }
4700 s += n;
4701 continue;
4702
4703 surrogateescape:
4704 *p++ = 0xDC00 + ch;
4705 s++;
4706 }
4707 *p = L'\0';
4708 return unicode;
4709}
4710
4711#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004713/* Primary internal function which creates utf8 encoded bytes objects.
4714
4715 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004716 and allocate exactly as much space needed at the end. Else allocate the
4717 maximum possible needed (4 result bytes per Unicode character), and return
4718 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004719*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004720PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004721_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722{
Tim Peters602f7402002-04-27 18:03:26 +00004723#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004724
Guido van Rossum98297ee2007-11-06 21:34:58 +00004725 Py_ssize_t i; /* index into s of next input byte */
4726 PyObject *result; /* result string object */
4727 char *p; /* next free byte in output buffer */
4728 Py_ssize_t nallocated; /* number of result bytes allocated */
4729 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004730 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004731 PyObject *errorHandler = NULL;
4732 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004733 int kind;
4734 void *data;
4735 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004736 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004738 if (!PyUnicode_Check(unicode)) {
4739 PyErr_BadArgument();
4740 return NULL;
4741 }
4742
4743 if (PyUnicode_READY(unicode) == -1)
4744 return NULL;
4745
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004746 if (PyUnicode_UTF8(unicode))
4747 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4748 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004749
4750 kind = PyUnicode_KIND(unicode);
4751 data = PyUnicode_DATA(unicode);
4752 size = PyUnicode_GET_LENGTH(unicode);
4753
Tim Peters602f7402002-04-27 18:03:26 +00004754 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755
Tim Peters602f7402002-04-27 18:03:26 +00004756 if (size <= MAX_SHORT_UNICHARS) {
4757 /* Write into the stack buffer; nallocated can't overflow.
4758 * At the end, we'll allocate exactly as much heap space as it
4759 * turns out we need.
4760 */
4761 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004762 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004763 p = stackbuf;
4764 }
4765 else {
4766 /* Overallocate on the heap, and give the excess back at the end. */
4767 nallocated = size * 4;
4768 if (nallocated / 4 != size) /* overflow! */
4769 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004770 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004771 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004772 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004773 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004774 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004775
Tim Peters602f7402002-04-27 18:03:26 +00004776 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004778
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004779 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004780 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004782
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004784 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004785 *p++ = (char)(0xc0 | (ch >> 6));
4786 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004787 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004789 Py_ssize_t repsize, k, startpos;
4790 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 rep = unicode_encode_call_errorhandler(
4792 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004793 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794 if (!rep)
4795 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 if (PyBytes_Check(rep))
4798 repsize = PyBytes_GET_SIZE(rep);
4799 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004800 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004801
4802 if (repsize > 4) {
4803 Py_ssize_t offset;
4804
4805 if (result == NULL)
4806 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004807 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4811 /* integer overflow */
4812 PyErr_NoMemory();
4813 goto error;
4814 }
4815 nallocated += repsize - 4;
4816 if (result != NULL) {
4817 if (_PyBytes_Resize(&result, nallocated) < 0)
4818 goto error;
4819 } else {
4820 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004821 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004822 goto error;
4823 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4824 }
4825 p = PyBytes_AS_STRING(result) + offset;
4826 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 if (PyBytes_Check(rep)) {
4829 char *prep = PyBytes_AS_STRING(rep);
4830 for(k = repsize; k > 0; k--)
4831 *p++ = *prep++;
4832 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004833 enum PyUnicode_Kind repkind;
4834 void *repdata;
4835
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004836 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004837 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004838 repkind = PyUnicode_KIND(rep);
4839 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004840
4841 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004842 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004844 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004845 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004846 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004848 goto error;
4849 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004850 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004851 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004852 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004853 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004854 } else if (ch < 0x10000) {
4855 *p++ = (char)(0xe0 | (ch >> 12));
4856 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4857 *p++ = (char)(0x80 | (ch & 0x3f));
4858 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004859 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004860 /* Encode UCS4 Unicode ordinals */
4861 *p++ = (char)(0xf0 | (ch >> 18));
4862 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4863 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4864 *p++ = (char)(0x80 | (ch & 0x3f));
4865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004867
Guido van Rossum98297ee2007-11-06 21:34:58 +00004868 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004869 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004870 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004871 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004872 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004873 }
4874 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004875 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004876 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004877 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004878 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004881 Py_XDECREF(errorHandler);
4882 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004883 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004884 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004885 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004886 Py_XDECREF(errorHandler);
4887 Py_XDECREF(exc);
4888 Py_XDECREF(result);
4889 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004890
Tim Peters602f7402002-04-27 18:03:26 +00004891#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892}
4893
Alexander Belopolsky40018472011-02-26 01:02:56 +00004894PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004895PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4896 Py_ssize_t size,
4897 const char *errors)
4898{
4899 PyObject *v, *unicode;
4900
4901 unicode = PyUnicode_FromUnicode(s, size);
4902 if (unicode == NULL)
4903 return NULL;
4904 v = _PyUnicode_AsUTF8String(unicode, errors);
4905 Py_DECREF(unicode);
4906 return v;
4907}
4908
4909PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004910PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004912 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913}
4914
Walter Dörwald41980ca2007-08-16 21:55:45 +00004915/* --- UTF-32 Codec ------------------------------------------------------- */
4916
4917PyObject *
4918PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 Py_ssize_t size,
4920 const char *errors,
4921 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922{
4923 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4924}
4925
4926PyObject *
4927PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 Py_ssize_t size,
4929 const char *errors,
4930 int *byteorder,
4931 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932{
4933 const char *starts = s;
4934 Py_ssize_t startinpos;
4935 Py_ssize_t endinpos;
4936 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004937 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004938 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939 int bo = 0; /* assume native ordering by default */
4940 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 /* Offsets from q for retrieving bytes in the right order. */
4942#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4943 int iorder[] = {0, 1, 2, 3};
4944#else
4945 int iorder[] = {3, 2, 1, 0};
4946#endif
4947 PyObject *errorHandler = NULL;
4948 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004949
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 q = (unsigned char *)s;
4951 e = q + size;
4952
4953 if (byteorder)
4954 bo = *byteorder;
4955
4956 /* Check for BOM marks (U+FEFF) in the input and adjust current
4957 byte order setting accordingly. In native mode, the leading BOM
4958 mark is skipped, in all other modes, it is copied to the output
4959 stream as-is (giving a ZWNBSP character). */
4960 if (bo == 0) {
4961 if (size >= 4) {
4962 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 if (bom == 0x0000FEFF) {
4966 q += 4;
4967 bo = -1;
4968 }
4969 else if (bom == 0xFFFE0000) {
4970 q += 4;
4971 bo = 1;
4972 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 if (bom == 0x0000FEFF) {
4975 q += 4;
4976 bo = 1;
4977 }
4978 else if (bom == 0xFFFE0000) {
4979 q += 4;
4980 bo = -1;
4981 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
4986 if (bo == -1) {
4987 /* force LE */
4988 iorder[0] = 0;
4989 iorder[1] = 1;
4990 iorder[2] = 2;
4991 iorder[3] = 3;
4992 }
4993 else if (bo == 1) {
4994 /* force BE */
4995 iorder[0] = 3;
4996 iorder[1] = 2;
4997 iorder[2] = 1;
4998 iorder[3] = 0;
4999 }
5000
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005001 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005002 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005003 if (!unicode)
5004 return NULL;
5005 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005006 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005007 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005008
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 Py_UCS4 ch;
5011 /* remaining bytes at the end? (size should be divisible by 4) */
5012 if (e-q<4) {
5013 if (consumed)
5014 break;
5015 errmsg = "truncated data";
5016 startinpos = ((const char *)q)-starts;
5017 endinpos = ((const char *)e)-starts;
5018 goto utf32Error;
5019 /* The remaining input chars are ignored if the callback
5020 chooses to skip the input */
5021 }
5022 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5023 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 if (ch >= 0x110000)
5026 {
5027 errmsg = "codepoint not in range(0x110000)";
5028 startinpos = ((const char *)q)-starts;
5029 endinpos = startinpos+4;
5030 goto utf32Error;
5031 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005032 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5033 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 q += 4;
5035 continue;
5036 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 if (unicode_decode_call_errorhandler(
5038 errors, &errorHandler,
5039 "utf32", errmsg,
5040 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005041 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 }
5044
5045 if (byteorder)
5046 *byteorder = bo;
5047
5048 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050
5051 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005052 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 goto onError;
5054
5055 Py_XDECREF(errorHandler);
5056 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005057 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060 Py_DECREF(unicode);
5061 Py_XDECREF(errorHandler);
5062 Py_XDECREF(exc);
5063 return NULL;
5064}
5065
5066PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005067_PyUnicode_EncodeUTF32(PyObject *str,
5068 const char *errors,
5069 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005071 int kind;
5072 void *data;
5073 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005074 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077 /* Offsets from p for storing byte pairs in the right order. */
5078#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5079 int iorder[] = {0, 1, 2, 3};
5080#else
5081 int iorder[] = {3, 2, 1, 0};
5082#endif
5083
Benjamin Peterson29060642009-01-31 22:14:21 +00005084#define STORECHAR(CH) \
5085 do { \
5086 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5087 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5088 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5089 p[iorder[0]] = (CH) & 0xff; \
5090 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091 } while(0)
5092
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005093 if (!PyUnicode_Check(str)) {
5094 PyErr_BadArgument();
5095 return NULL;
5096 }
5097 if (PyUnicode_READY(str) < 0)
5098 return NULL;
5099 kind = PyUnicode_KIND(str);
5100 data = PyUnicode_DATA(str);
5101 len = PyUnicode_GET_LENGTH(str);
5102
5103 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005104 bytesize = nsize * 4;
5105 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005107 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108 if (v == NULL)
5109 return NULL;
5110
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005111 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005114 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005115 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116
5117 if (byteorder == -1) {
5118 /* force LE */
5119 iorder[0] = 0;
5120 iorder[1] = 1;
5121 iorder[2] = 2;
5122 iorder[3] = 3;
5123 }
5124 else if (byteorder == 1) {
5125 /* force BE */
5126 iorder[0] = 3;
5127 iorder[1] = 2;
5128 iorder[2] = 1;
5129 iorder[3] = 0;
5130 }
5131
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132 for (i = 0; i < len; i++)
5133 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005134
5135 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005136 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137#undef STORECHAR
5138}
5139
Alexander Belopolsky40018472011-02-26 01:02:56 +00005140PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005141PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5142 Py_ssize_t size,
5143 const char *errors,
5144 int byteorder)
5145{
5146 PyObject *result;
5147 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5148 if (tmp == NULL)
5149 return NULL;
5150 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5151 Py_DECREF(tmp);
5152 return result;
5153}
5154
5155PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005156PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157{
Victor Stinnerb960b342011-11-20 19:12:52 +01005158 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159}
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161/* --- UTF-16 Codec ------------------------------------------------------- */
5162
Tim Peters772747b2001-08-09 22:21:55 +00005163PyObject *
5164PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_ssize_t size,
5166 const char *errors,
5167 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168{
Walter Dörwald69652032004-09-07 20:24:22 +00005169 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5170}
5171
Antoine Pitrouab868312009-01-10 15:40:25 +00005172/* Two masks for fast checking of whether a C 'long' may contain
5173 UTF16-encoded surrogate characters. This is an efficient heuristic,
5174 assuming that non-surrogate characters with a code point >= 0x8000 are
5175 rare in most input.
5176 FAST_CHAR_MASK is used when the input is in native byte ordering,
5177 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005178*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005179#if (SIZEOF_LONG == 8)
5180# define FAST_CHAR_MASK 0x8000800080008000L
5181# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5182#elif (SIZEOF_LONG == 4)
5183# define FAST_CHAR_MASK 0x80008000L
5184# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5185#else
5186# error C 'long' size should be either 4 or 8!
5187#endif
5188
Walter Dörwald69652032004-09-07 20:24:22 +00005189PyObject *
5190PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 Py_ssize_t size,
5192 const char *errors,
5193 int *byteorder,
5194 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 Py_ssize_t startinpos;
5198 Py_ssize_t endinpos;
5199 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005200 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005201 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005202 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005203 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005204 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005205 /* Offsets from q for retrieving byte pairs in the right order. */
5206#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5207 int ihi = 1, ilo = 0;
5208#else
5209 int ihi = 0, ilo = 1;
5210#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005211 PyObject *errorHandler = NULL;
5212 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
5214 /* Note: size will always be longer than the resulting Unicode
5215 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005216 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 if (!unicode)
5218 return NULL;
5219 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005220 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005221 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Tim Peters772747b2001-08-09 22:21:55 +00005223 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005224 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225
5226 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005227 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005229 /* Check for BOM marks (U+FEFF) in the input and adjust current
5230 byte order setting accordingly. In native mode, the leading BOM
5231 mark is skipped, in all other modes, it is copied to the output
5232 stream as-is (giving a ZWNBSP character). */
5233 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005234 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005235 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005236#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 if (bom == 0xFEFF) {
5238 q += 2;
5239 bo = -1;
5240 }
5241 else if (bom == 0xFFFE) {
5242 q += 2;
5243 bo = 1;
5244 }
Tim Petersced69f82003-09-16 20:30:58 +00005245#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 if (bom == 0xFEFF) {
5247 q += 2;
5248 bo = 1;
5249 }
5250 else if (bom == 0xFFFE) {
5251 q += 2;
5252 bo = -1;
5253 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005254#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Tim Peters772747b2001-08-09 22:21:55 +00005258 if (bo == -1) {
5259 /* force LE */
5260 ihi = 1;
5261 ilo = 0;
5262 }
5263 else if (bo == 1) {
5264 /* force BE */
5265 ihi = 0;
5266 ilo = 1;
5267 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005268#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5269 native_ordering = ilo < ihi;
5270#else
5271 native_ordering = ilo > ihi;
5272#endif
Tim Peters772747b2001-08-09 22:21:55 +00005273
Antoine Pitrouab868312009-01-10 15:40:25 +00005274 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005275 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005276 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005277 /* First check for possible aligned read of a C 'long'. Unaligned
5278 reads are more expensive, better to defer to another iteration. */
5279 if (!((size_t) q & LONG_PTR_MASK)) {
5280 /* Fast path for runs of non-surrogate chars. */
5281 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005282 int kind = PyUnicode_KIND(unicode);
5283 void *data = PyUnicode_DATA(unicode);
5284 while (_q < aligned_end) {
5285 unsigned long block = * (unsigned long *) _q;
5286 unsigned short *pblock = (unsigned short*)&block;
5287 Py_UCS4 maxch;
5288 if (native_ordering) {
5289 /* Can use buffer directly */
5290 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005291 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005292 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005293 else {
5294 /* Need to byte-swap */
5295 unsigned char *_p = (unsigned char*)pblock;
5296 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005297 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005298 _p[0] = _q[1];
5299 _p[1] = _q[0];
5300 _p[2] = _q[3];
5301 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005302#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005303 _p[4] = _q[5];
5304 _p[5] = _q[4];
5305 _p[6] = _q[7];
5306 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005307#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005308 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005309 maxch = Py_MAX(pblock[0], pblock[1]);
5310#if SIZEOF_LONG == 8
5311 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5312#endif
5313 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5314 if (unicode_widen(&unicode, maxch) < 0)
5315 goto onError;
5316 kind = PyUnicode_KIND(unicode);
5317 data = PyUnicode_DATA(unicode);
5318 }
5319 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5320 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5321#if SIZEOF_LONG == 8
5322 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5323 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5324#endif
5325 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005327 q = _q;
5328 if (q >= e)
5329 break;
5330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005332
Benjamin Peterson14339b62009-01-31 16:36:08 +00005333 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005334
Victor Stinner551ac952011-11-29 22:58:13 +01005335 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005336 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5337 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 continue;
5339 }
5340
5341 /* UTF-16 code pair: */
5342 if (q > e) {
5343 errmsg = "unexpected end of data";
5344 startinpos = (((const char *)q) - 2) - starts;
5345 endinpos = ((const char *)e) + 1 - starts;
5346 goto utf16Error;
5347 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005348 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5349 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005351 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005352 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005353 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005354 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 continue;
5356 }
5357 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005358 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 startinpos = (((const char *)q)-4)-starts;
5360 endinpos = startinpos+2;
5361 goto utf16Error;
5362 }
5363
Benjamin Peterson14339b62009-01-31 16:36:08 +00005364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 errmsg = "illegal encoding";
5366 startinpos = (((const char *)q)-2)-starts;
5367 endinpos = startinpos+2;
5368 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005369
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005372 errors,
5373 &errorHandler,
5374 "utf16", errmsg,
5375 &starts,
5376 (const char **)&e,
5377 &startinpos,
5378 &endinpos,
5379 &exc,
5380 (const char **)&q,
5381 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005382 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005385 /* remaining byte at the end? (size should be even) */
5386 if (e == q) {
5387 if (!consumed) {
5388 errmsg = "truncated data";
5389 startinpos = ((const char *)q) - starts;
5390 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005391 if (unicode_decode_call_errorhandler(
5392 errors,
5393 &errorHandler,
5394 "utf16", errmsg,
5395 &starts,
5396 (const char **)&e,
5397 &startinpos,
5398 &endinpos,
5399 &exc,
5400 (const char **)&q,
5401 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005402 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005403 goto onError;
5404 /* The remaining input chars are ignored if the callback
5405 chooses to skip the input */
5406 }
5407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
5409 if (byteorder)
5410 *byteorder = bo;
5411
Walter Dörwald69652032004-09-07 20:24:22 +00005412 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005416 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 goto onError;
5418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 Py_XDECREF(errorHandler);
5420 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005421 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 Py_XDECREF(errorHandler);
5426 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 return NULL;
5428}
5429
Antoine Pitrouab868312009-01-10 15:40:25 +00005430#undef FAST_CHAR_MASK
5431#undef SWAPPED_FAST_CHAR_MASK
5432
Tim Peters772747b2001-08-09 22:21:55 +00005433PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005434_PyUnicode_EncodeUTF16(PyObject *str,
5435 const char *errors,
5436 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005438 int kind;
5439 void *data;
5440 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005441 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005442 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005443 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005444 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005445 /* Offsets from p for storing byte pairs in the right order. */
5446#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5447 int ihi = 1, ilo = 0;
5448#else
5449 int ihi = 0, ilo = 1;
5450#endif
5451
Benjamin Peterson29060642009-01-31 22:14:21 +00005452#define STORECHAR(CH) \
5453 do { \
5454 p[ihi] = ((CH) >> 8) & 0xff; \
5455 p[ilo] = (CH) & 0xff; \
5456 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005457 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005459 if (!PyUnicode_Check(str)) {
5460 PyErr_BadArgument();
5461 return NULL;
5462 }
5463 if (PyUnicode_READY(str) < 0)
5464 return NULL;
5465 kind = PyUnicode_KIND(str);
5466 data = PyUnicode_DATA(str);
5467 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005468
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005469 pairs = 0;
5470 if (kind == PyUnicode_4BYTE_KIND)
5471 for (i = 0; i < len; i++)
5472 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5473 pairs++;
5474 /* 2 * (len + pairs + (byteorder == 0)) */
5475 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005477 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005478 bytesize = nsize * 2;
5479 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005480 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005481 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 if (v == NULL)
5483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005485 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005488 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005489 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005490
5491 if (byteorder == -1) {
5492 /* force LE */
5493 ihi = 1;
5494 ilo = 0;
5495 }
5496 else if (byteorder == 1) {
5497 /* force BE */
5498 ihi = 0;
5499 ilo = 1;
5500 }
5501
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005502 for (i = 0; i < len; i++) {
5503 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5504 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005506 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5507 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 }
Tim Peters772747b2001-08-09 22:21:55 +00005509 STORECHAR(ch);
5510 if (ch2)
5511 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005512 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005513
5514 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005515 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005516#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517}
5518
Alexander Belopolsky40018472011-02-26 01:02:56 +00005519PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005520PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5521 Py_ssize_t size,
5522 const char *errors,
5523 int byteorder)
5524{
5525 PyObject *result;
5526 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5527 if (tmp == NULL)
5528 return NULL;
5529 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5530 Py_DECREF(tmp);
5531 return result;
5532}
5533
5534PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005535PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538}
5539
5540/* --- Unicode Escape Codec ----------------------------------------------- */
5541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5543 if all the escapes in the string make it still a valid ASCII string.
5544 Returns -1 if any escapes were found which cause the string to
5545 pop out of ASCII range. Otherwise returns the length of the
5546 required buffer to hold the string.
5547 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005548static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005549length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5550{
5551 const unsigned char *p = (const unsigned char *)s;
5552 const unsigned char *end = p + size;
5553 Py_ssize_t length = 0;
5554
5555 if (size < 0)
5556 return -1;
5557
5558 for (; p < end; ++p) {
5559 if (*p > 127) {
5560 /* Non-ASCII */
5561 return -1;
5562 }
5563 else if (*p != '\\') {
5564 /* Normal character */
5565 ++length;
5566 }
5567 else {
5568 /* Backslash-escape, check next char */
5569 ++p;
5570 /* Escape sequence reaches till end of string or
5571 non-ASCII follow-up. */
5572 if (p >= end || *p > 127)
5573 return -1;
5574 switch (*p) {
5575 case '\n':
5576 /* backslash + \n result in zero characters */
5577 break;
5578 case '\\': case '\'': case '\"':
5579 case 'b': case 'f': case 't':
5580 case 'n': case 'r': case 'v': case 'a':
5581 ++length;
5582 break;
5583 case '0': case '1': case '2': case '3':
5584 case '4': case '5': case '6': case '7':
5585 case 'x': case 'u': case 'U': case 'N':
5586 /* these do not guarantee ASCII characters */
5587 return -1;
5588 default:
5589 /* count the backslash + the other character */
5590 length += 2;
5591 }
5592 }
5593 }
5594 return length;
5595}
5596
/* File-scope pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the \N{...} escape
   handling in PyUnicode_DecodeUnicodeEscape -- confirm against its users. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005597static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005598
Alexander Belopolsky40018472011-02-26 01:02:56 +00005599PyObject *
5600PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005601 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005602 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005605 Py_ssize_t startinpos;
5606 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005608 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005610 char* message;
5611 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005612 PyObject *errorHandler = NULL;
5613 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005614 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005615 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005617 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618
5619 /* After length_of_escaped_ascii_string() there are two alternatives,
5620 either the string is pure ASCII with named escapes like \n, etc.
5621 and we determined it's exact size (common case)
5622 or it contains \x, \u, ... escape sequences. then we create a
5623 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005624 if (len >= 0) {
5625 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 if (!v)
5627 goto onError;
5628 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629 }
5630 else {
5631 /* Escaped strings will always be longer than the resulting
5632 Unicode string, so we start with size here and then reduce the
5633 length after conversion to the true value.
5634 (but if the error callback returns a long replacement string
5635 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005636 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005637 if (!v)
5638 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005639 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 }
5641
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005643 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005644 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005646
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 while (s < end) {
5648 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005649 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 /* The only case in which i == ascii_length is a backslash
5653 followed by a newline. */
5654 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 /* Non-escape characters are interpreted as Unicode ordinals */
5657 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 continue;
5661 }
5662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664 /* \ - Escapes */
5665 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005666 c = *s++;
5667 if (s > end)
5668 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005670 /* The only case in which i == ascii_length is a backslash
5671 followed by a newline. */
5672 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005673
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005674 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677#define WRITECHAR(ch) \
5678 do { \
5679 if (unicode_putchar(&v, &i, ch) < 0) \
5680 goto onError; \
5681 }while(0)
5682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005684 case '\\': WRITECHAR('\\'); break;
5685 case '\'': WRITECHAR('\''); break;
5686 case '\"': WRITECHAR('\"'); break;
5687 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005689 case 'f': WRITECHAR('\014'); break;
5690 case 't': WRITECHAR('\t'); break;
5691 case 'n': WRITECHAR('\n'); break;
5692 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005694 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005696 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 case '0': case '1': case '2': case '3':
5700 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005701 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005702 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005703 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005704 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005705 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005707 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 break;
5709
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 /* hex escapes */
5711 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005713 digits = 2;
5714 message = "truncated \\xXX escape";
5715 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005719 digits = 4;
5720 message = "truncated \\uXXXX escape";
5721 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005724 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 digits = 8;
5726 message = "truncated \\UXXXXXXXX escape";
5727 hexescape:
5728 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 if (s+digits>end) {
5730 endinpos = size;
5731 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 errors, &errorHandler,
5733 "unicodeescape", "end of string in escape sequence",
5734 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005735 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 goto onError;
5737 goto nextByte;
5738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005739 for (j = 0; j < digits; ++j) {
5740 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005741 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005742 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005744 errors, &errorHandler,
5745 "unicodeescape", message,
5746 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005747 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005748 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005749 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005751 }
5752 chr = (chr<<4) & ~0xF;
5753 if (c >= '0' && c <= '9')
5754 chr += c - '0';
5755 else if (c >= 'a' && c <= 'f')
5756 chr += 10 + c - 'a';
5757 else
5758 chr += 10 + c - 'A';
5759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005760 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005761 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 /* _decoding_error will have already written into the
5763 target buffer. */
5764 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005765 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005766 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005767 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 errors, &errorHandler,
5773 "unicodeescape", "illegal Unicode character",
5774 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005775 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005776 goto onError;
5777 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005781 case 'N':
5782 message = "malformed \\N character escape";
5783 if (ucnhash_CAPI == NULL) {
5784 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5786 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 if (ucnhash_CAPI == NULL)
5788 goto ucnhashError;
5789 }
5790 if (*s == '{') {
5791 const char *start = s+1;
5792 /* look for the closing brace */
5793 while (*s != '}' && s < end)
5794 s++;
5795 if (s > start && s < end && *s == '}') {
5796 /* found a name. look it up in the unicode database */
5797 message = "unknown Unicode character name";
5798 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005799 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005800 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801 goto store;
5802 }
5803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 errors, &errorHandler,
5807 "unicodeescape", message,
5808 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005809 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005810 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005811 break;
5812
5813 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 message = "\\ at end of string";
5816 s--;
5817 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 errors, &errorHandler,
5820 "unicodeescape", message,
5821 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005823 goto onError;
5824 }
5825 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005826 WRITECHAR('\\');
5827 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005828 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005834#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005836 if (PyUnicode_Resize(&v, i) < 0)
5837 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005838 Py_XDECREF(errorHandler);
5839 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005840 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005841
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005843 PyErr_SetString(
5844 PyExc_UnicodeError,
5845 "\\N escapes not supported (can't load unicodedata module)"
5846 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005847 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 Py_XDECREF(errorHandler);
5849 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005850 return NULL;
5851
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 Py_XDECREF(errorHandler);
5855 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 return NULL;
5857}
5858
5859/* Return a Unicode-Escape string version of the Unicode object.
5860
5861 If quotes is true, the string is enclosed in u"" or u'' quotes as
5862 appropriate.
5863
5864*/
5865
Alexander Belopolsky40018472011-02-26 01:02:56 +00005866PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005869 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005870 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 int kind;
5873 void *data;
5874 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
Thomas Wouters89f507f2006-12-13 04:49:30 +00005876 /* Initial allocation is based on the longest-possible unichr
5877 escape.
5878
5879 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5880 unichr, so in this case it's the longest unichr escape. In
5881 narrow (UTF-16) builds this is five chars per source unichr
5882 since there are two unichrs in the surrogate pair, so in narrow
5883 (UTF-16) builds it's not the longest unichr escape.
5884
5885 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5886 so in the narrow (UTF-16) build case it's the longest unichr
5887 escape.
5888 */
5889
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 if (!PyUnicode_Check(unicode)) {
5891 PyErr_BadArgument();
5892 return NULL;
5893 }
5894 if (PyUnicode_READY(unicode) < 0)
5895 return NULL;
5896 len = PyUnicode_GET_LENGTH(unicode);
5897 kind = PyUnicode_KIND(unicode);
5898 data = PyUnicode_DATA(unicode);
5899 switch(kind) {
5900 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5901 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5902 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5903 }
5904
5905 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005906 return PyBytes_FromStringAndSize(NULL, 0);
5907
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005910
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 if (repr == NULL)
5916 return NULL;
5917
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005918 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005920 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005921 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005922
Walter Dörwald79e913e2007-05-12 11:08:06 +00005923 /* Escape backslashes */
5924 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 *p++ = '\\';
5926 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005927 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005928 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005929
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005930 /* Map 21-bit characters to '\U00xxxxxx' */
5931 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005932 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005933 *p++ = '\\';
5934 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005935 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5936 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5937 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5938 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5940 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5941 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5942 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005944 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005945
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005947 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 *p++ = '\\';
5949 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005950 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5951 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5952 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5953 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005955
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005956 /* Map special whitespace to '\t', \n', '\r' */
5957 else if (ch == '\t') {
5958 *p++ = '\\';
5959 *p++ = 't';
5960 }
5961 else if (ch == '\n') {
5962 *p++ = '\\';
5963 *p++ = 'n';
5964 }
5965 else if (ch == '\r') {
5966 *p++ = '\\';
5967 *p++ = 'r';
5968 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005969
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005970 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005971 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005973 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005974 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5975 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005976 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005977
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 /* Copy everything else as-is */
5979 else
5980 *p++ = (char) ch;
5981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005983 assert(p - PyBytes_AS_STRING(repr) > 0);
5984 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5985 return NULL;
5986 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987}
5988
Alexander Belopolsky40018472011-02-26 01:02:56 +00005989PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005990PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5991 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005993 PyObject *result;
5994 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5995 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005997 result = PyUnicode_AsUnicodeEscapeString(tmp);
5998 Py_DECREF(tmp);
5999 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000}
6001
6002/* --- Raw Unicode Escape Codec ------------------------------------------- */
6003
Alexander Belopolsky40018472011-02-26 01:02:56 +00006004PyObject *
6005PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006006 Py_ssize_t size,
6007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 Py_ssize_t startinpos;
6011 Py_ssize_t endinpos;
6012 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006013 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 const char *end;
6015 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 PyObject *errorHandler = NULL;
6017 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Escaped strings will always be longer than the resulting
6020 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 length after conversion to the true value. (But decoding error
6022 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006023 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006027 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006028 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 end = s + size;
6030 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 unsigned char c;
6032 Py_UCS4 x;
6033 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006034 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 /* Non-escape characters are interpreted as Unicode ordinals */
6037 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006038 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6039 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006041 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 startinpos = s-starts;
6043
6044 /* \u-escapes are only interpreted iff the number of leading
6045 backslashes if odd */
6046 bs = s;
6047 for (;s < end;) {
6048 if (*s != '\\')
6049 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6051 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 }
6053 if (((s - bs) & 1) == 0 ||
6054 s >= end ||
6055 (*s != 'u' && *s != 'U')) {
6056 continue;
6057 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006058 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 count = *s=='u' ? 4 : 8;
6060 s++;
6061
6062 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 for (x = 0, i = 0; i < count; ++i, ++s) {
6064 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006065 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 endinpos = s-starts;
6067 if (unicode_decode_call_errorhandler(
6068 errors, &errorHandler,
6069 "rawunicodeescape", "truncated \\uXXXX",
6070 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006071 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 goto onError;
6073 goto nextByte;
6074 }
6075 x = (x<<4) & ~0xF;
6076 if (c >= '0' && c <= '9')
6077 x += c - '0';
6078 else if (c >= 'a' && c <= 'f')
6079 x += 10 + c - 'a';
6080 else
6081 x += 10 + c - 'A';
6082 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006083 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006084 if (unicode_putchar(&v, &outpos, x) < 0)
6085 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006086 } else {
6087 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006088 if (unicode_decode_call_errorhandler(
6089 errors, &errorHandler,
6090 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006092 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 nextByte:
6096 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100 Py_XDECREF(errorHandler);
6101 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006102 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006103
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 Py_XDECREF(errorHandler);
6107 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 return NULL;
6109}
6110
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006111
Alexander Belopolsky40018472011-02-26 01:02:56 +00006112PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006113PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006115 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 char *p;
6117 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 Py_ssize_t expandsize, pos;
6119 int kind;
6120 void *data;
6121 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006123 if (!PyUnicode_Check(unicode)) {
6124 PyErr_BadArgument();
6125 return NULL;
6126 }
6127 if (PyUnicode_READY(unicode) < 0)
6128 return NULL;
6129 kind = PyUnicode_KIND(unicode);
6130 data = PyUnicode_DATA(unicode);
6131 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006132 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6133 bytes, and 1 byte characters 4. */
6134 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006135
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006136 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006138
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 if (repr == NULL)
6141 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006143 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006145 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 for (pos = 0; pos < len; pos++) {
6147 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 /* Map 32-bit characters to '\Uxxxxxxxx' */
6149 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006150 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006151 *p++ = '\\';
6152 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006153 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6154 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6155 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6156 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6160 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 *p++ = '\\';
6165 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006166 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6167 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6168 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6169 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 /* Copy everything else as-is */
6172 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 *p++ = (char) ch;
6174 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006175
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 assert(p > q);
6177 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006178 return NULL;
6179 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Alexander Belopolsky40018472011-02-26 01:02:56 +00006182PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6184 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 PyObject *result;
6187 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6188 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006189 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6191 Py_DECREF(tmp);
6192 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193}
6194
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006195/* --- Unicode Internal Codec ------------------------------------------- */
6196
Alexander Belopolsky40018472011-02-26 01:02:56 +00006197PyObject *
6198_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006199 Py_ssize_t size,
6200 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006201{
6202 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006203 Py_ssize_t startinpos;
6204 Py_ssize_t endinpos;
6205 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006206 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006207 const char *end;
6208 const char *reason;
6209 PyObject *errorHandler = NULL;
6210 PyObject *exc = NULL;
6211
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006212 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006213 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006214 1))
6215 return NULL;
6216
Thomas Wouters89f507f2006-12-13 04:49:30 +00006217 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006218 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006219 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006221 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006222 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006223 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006224 end = s + size;
6225
6226 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006227 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006228 Py_UCS4 ch;
6229 /* We copy the raw representation one byte at a time because the
6230 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006231 ((char *) &uch)[0] = s[0];
6232 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006233#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006234 ((char *) &uch)[2] = s[2];
6235 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006236#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006237 ch = uch;
6238
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006239 /* We have to sanity check the raw data, otherwise doom looms for
6240 some malformed UCS-4 data. */
6241 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006242#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006243 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006244#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245 end-s < Py_UNICODE_SIZE
6246 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006248 startinpos = s - starts;
6249 if (end-s < Py_UNICODE_SIZE) {
6250 endinpos = end-starts;
6251 reason = "truncated input";
6252 }
6253 else {
6254 endinpos = s - starts + Py_UNICODE_SIZE;
6255 reason = "illegal code point (> 0x10FFFF)";
6256 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006257 if (unicode_decode_call_errorhandler(
6258 errors, &errorHandler,
6259 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006260 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006261 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006262 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006263 continue;
6264 }
6265
6266 s += Py_UNICODE_SIZE;
6267#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006268 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006269 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006270 Py_UNICODE uch2;
6271 ((char *) &uch2)[0] = s[0];
6272 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006273 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006274 {
Victor Stinner551ac952011-11-29 22:58:13 +01006275 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006276 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 }
6278 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006279#endif
6280
6281 if (unicode_putchar(&v, &outpos, ch) < 0)
6282 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 }
6284
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006285 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 goto onError;
6287 Py_XDECREF(errorHandler);
6288 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006289 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292 Py_XDECREF(v);
6293 Py_XDECREF(errorHandler);
6294 Py_XDECREF(exc);
6295 return NULL;
6296}
6297
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298/* --- Latin-1 Codec ------------------------------------------------------ */
6299
Alexander Belopolsky40018472011-02-26 01:02:56 +00006300PyObject *
6301PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006302 Py_ssize_t size,
6303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006306 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307}
6308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006309/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006310static void
6311make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006312 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006313 PyObject *unicode,
6314 Py_ssize_t startpos, Py_ssize_t endpos,
6315 const char *reason)
6316{
6317 if (*exceptionObject == NULL) {
6318 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006319 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006320 encoding, unicode, startpos, endpos, reason);
6321 }
6322 else {
6323 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6324 goto onError;
6325 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6326 goto onError;
6327 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6328 goto onError;
6329 return;
6330 onError:
6331 Py_DECREF(*exceptionObject);
6332 *exceptionObject = NULL;
6333 }
6334}
6335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337static void
6338raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006339 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006340 PyObject *unicode,
6341 Py_ssize_t startpos, Py_ssize_t endpos,
6342 const char *reason)
6343{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006344 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006345 encoding, unicode, startpos, endpos, reason);
6346 if (*exceptionObject != NULL)
6347 PyCodec_StrictErrors(*exceptionObject);
6348}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349
6350/* error handling callback helper:
6351 build arguments, call the callback and check the arguments,
6352 put the result into newpos and return the replacement string, which
6353 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006354static PyObject *
6355unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006356 PyObject **errorHandler,
6357 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006359 Py_ssize_t startpos, Py_ssize_t endpos,
6360 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006362 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006363 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 PyObject *restuple;
6365 PyObject *resunicode;
6366
6367 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 }
6372
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 if (PyUnicode_READY(unicode) < 0)
6374 return NULL;
6375 len = PyUnicode_GET_LENGTH(unicode);
6376
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006377 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006378 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381
6382 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006387 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 Py_DECREF(restuple);
6389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 &resunicode, newpos)) {
6393 Py_DECREF(restuple);
6394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006396 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6397 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6398 Py_DECREF(restuple);
6399 return NULL;
6400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006402 *newpos = len + *newpos;
6403 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6405 Py_DECREF(restuple);
6406 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006407 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 Py_INCREF(resunicode);
6409 Py_DECREF(restuple);
6410 return resunicode;
6411}
6412
Alexander Belopolsky40018472011-02-26 01:02:56 +00006413static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006415 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006416 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 /* input state */
6419 Py_ssize_t pos=0, size;
6420 int kind;
6421 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 /* output object */
6423 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 /* pointer into the output */
6425 char *str;
6426 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006427 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006428 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6429 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 PyObject *errorHandler = NULL;
6431 PyObject *exc = NULL;
6432 /* the following variable is used for caching string comparisons
6433 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6434 int known_errorHandler = -1;
6435
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006436 if (PyUnicode_READY(unicode) < 0)
6437 return NULL;
6438 size = PyUnicode_GET_LENGTH(unicode);
6439 kind = PyUnicode_KIND(unicode);
6440 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 /* allocate enough for a simple encoding without
6442 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006443 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006444 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006445 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006447 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006448 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 ressize = size;
6450
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 while (pos < size) {
6452 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 /* can we encode this? */
6455 if (c<limit) {
6456 /* no overflow check, because we know that the space is enough */
6457 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006458 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 Py_ssize_t requiredsize;
6462 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006463 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 Py_ssize_t collstart = pos;
6466 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006468 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 ++collend;
6470 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6471 if (known_errorHandler==-1) {
6472 if ((errors==NULL) || (!strcmp(errors, "strict")))
6473 known_errorHandler = 1;
6474 else if (!strcmp(errors, "replace"))
6475 known_errorHandler = 2;
6476 else if (!strcmp(errors, "ignore"))
6477 known_errorHandler = 3;
6478 else if (!strcmp(errors, "xmlcharrefreplace"))
6479 known_errorHandler = 4;
6480 else
6481 known_errorHandler = 0;
6482 }
6483 switch (known_errorHandler) {
6484 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006485 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 goto onError;
6487 case 2: /* replace */
6488 while (collstart++<collend)
6489 *str++ = '?'; /* fall through */
6490 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 break;
6493 case 4: /* xmlcharrefreplace */
6494 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 /* determine replacement size */
6496 for (i = collstart, repsize = 0; i < collend; ++i) {
6497 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6498 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006504 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006510 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006511 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 if (requiredsize > ressize) {
6517 if (requiredsize<2*ressize)
6518 requiredsize = 2*ressize;
6519 if (_PyBytes_Resize(&res, requiredsize))
6520 goto onError;
6521 str = PyBytes_AS_STRING(res) + respos;
6522 ressize = requiredsize;
6523 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 /* generate replacement */
6525 for (i = collstart; i < collend; ++i) {
6526 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 break;
6530 default:
6531 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 encoding, reason, unicode, &exc,
6533 collstart, collend, &newpos);
6534 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6535 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006537 if (PyBytes_Check(repunicode)) {
6538 /* Directly copy bytes result to output. */
6539 repsize = PyBytes_Size(repunicode);
6540 if (repsize > 1) {
6541 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006542 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006543 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6544 Py_DECREF(repunicode);
6545 goto onError;
6546 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006547 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006548 ressize += repsize-1;
6549 }
6550 memcpy(str, PyBytes_AsString(repunicode), repsize);
6551 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006553 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006554 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 /* need more space? (at least enough for what we
6557 have+the replacement+the rest of the string, so
6558 we won't have to check space for encodable characters) */
6559 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 repsize = PyUnicode_GET_LENGTH(repunicode);
6561 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 if (requiredsize > ressize) {
6563 if (requiredsize<2*ressize)
6564 requiredsize = 2*ressize;
6565 if (_PyBytes_Resize(&res, requiredsize)) {
6566 Py_DECREF(repunicode);
6567 goto onError;
6568 }
6569 str = PyBytes_AS_STRING(res) + respos;
6570 ressize = requiredsize;
6571 }
6572 /* check if there is anything unencodable in the replacement
6573 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006574 for (i = 0; repsize-->0; ++i, ++str) {
6575 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006577 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006578 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
6582 *str = (char)c;
6583 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006587 }
6588 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006589 /* Resize if we allocated to much */
6590 size = str - PyBytes_AS_STRING(res);
6591 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006592 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006593 if (_PyBytes_Resize(&res, size) < 0)
6594 goto onError;
6595 }
6596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597 Py_XDECREF(errorHandler);
6598 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 return res;
6600
6601 onError:
6602 Py_XDECREF(res);
6603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
6605 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606}
6607
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006608/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609PyObject *
6610PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006611 Py_ssize_t size,
6612 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614 PyObject *result;
6615 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6616 if (unicode == NULL)
6617 return NULL;
6618 result = unicode_encode_ucs1(unicode, errors, 256);
6619 Py_DECREF(unicode);
6620 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621}
6622
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006624_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
6626 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 PyErr_BadArgument();
6628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006630 if (PyUnicode_READY(unicode) == -1)
6631 return NULL;
6632 /* Fast path: if it is a one-byte string, construct
6633 bytes object directly. */
6634 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6635 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6636 PyUnicode_GET_LENGTH(unicode));
6637 /* Non-Latin-1 characters present. Defer to above function to
6638 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006639 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640}
6641
6642PyObject*
6643PyUnicode_AsLatin1String(PyObject *unicode)
6644{
6645 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646}
6647
6648/* --- 7-bit ASCII Codec -------------------------------------------------- */
6649
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650PyObject *
6651PyUnicode_DecodeASCII(const char *s,
6652 Py_ssize_t size,
6653 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006656 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006657 int kind;
6658 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006659 Py_ssize_t startinpos;
6660 Py_ssize_t endinpos;
6661 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006663 int has_error;
6664 const unsigned char *p = (const unsigned char *)s;
6665 const unsigned char *end = p + size;
6666 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 PyObject *errorHandler = NULL;
6668 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006669
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006670 if (size == 0) {
6671 Py_INCREF(unicode_empty);
6672 return unicode_empty;
6673 }
6674
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006676 if (size == 1 && (unsigned char)s[0] < 128)
6677 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678
Victor Stinner702c7342011-10-05 13:50:52 +02006679 has_error = 0;
6680 while (p < end && !has_error) {
6681 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6682 an explanation. */
6683 if (!((size_t) p & LONG_PTR_MASK)) {
6684 /* Help register allocation */
6685 register const unsigned char *_p = p;
6686 while (_p < aligned_end) {
6687 unsigned long value = *(unsigned long *) _p;
6688 if (value & ASCII_CHAR_MASK) {
6689 has_error = 1;
6690 break;
6691 }
6692 _p += SIZEOF_LONG;
6693 }
6694 if (_p == end)
6695 break;
6696 if (has_error)
6697 break;
6698 p = _p;
6699 }
6700 if (*p & 0x80) {
6701 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006702 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006703 }
6704 else {
6705 ++p;
6706 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006707 }
Victor Stinner702c7342011-10-05 13:50:52 +02006708 if (!has_error)
6709 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006710
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006711 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006715 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006716 kind = PyUnicode_KIND(v);
6717 data = PyUnicode_DATA(v);
6718 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 e = s + size;
6720 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 register unsigned char c = (unsigned char)*s;
6722 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006723 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 ++s;
6725 }
6726 else {
6727 startinpos = s-starts;
6728 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 if (unicode_decode_call_errorhandler(
6730 errors, &errorHandler,
6731 "ascii", "ordinal not in range(128)",
6732 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006733 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006735 kind = PyUnicode_KIND(v);
6736 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006739 if (PyUnicode_Resize(&v, outpos) < 0)
6740 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 Py_XDECREF(errorHandler);
6742 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006743 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006744 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
6779 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
/* The code page helpers below take their input length as an int.  When
   size_t is wider than int, input longer than INT_MAX has to be processed in
   several chunks; NEED_RETRY enables that chunking. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Alexander Belopolsky40018472011-02-26 01:02:56 +00006820static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006821is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822{
6823 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006824 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
Victor Stinner3a50e702011-10-18 21:21:00 +02006826 if (!IsDBCSLeadByteEx(code_page, *curr))
6827 return 0;
6828
6829 prev = CharPrevExA(code_page, s, curr, 0);
6830 if (prev == curr)
6831 return 1;
6832 /* FIXME: This code is limited to "true" double-byte encodings,
6833 as it assumes an incomplete character consists of a single
6834 byte. */
6835 if (curr - prev == 2)
6836 return 1;
6837 if (!IsDBCSLeadByteEx(code_page, *prev))
6838 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839 return 0;
6840}
6841
Victor Stinner3a50e702011-10-18 21:21:00 +02006842static DWORD
6843decode_code_page_flags(UINT code_page)
6844{
6845 if (code_page == CP_UTF7) {
6846 /* The CP_UTF7 decoder only supports flags=0 */
6847 return 0;
6848 }
6849 else
6850 return MB_ERR_INVALID_CHARS;
6851}
6852
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 * Decode a byte string from a Windows code page into unicode object in strict
6855 * mode.
6856 *
6857 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6858 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006860static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006861decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006862 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 const char *in,
6864 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006865{
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006867 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869
6870 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 assert(insize > 0);
6872 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6873 if (outsize <= 0)
6874 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875
6876 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006878 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 if (*v == NULL)
6880 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006881 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882 }
6883 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006885 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006886 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006889 }
6890
6891 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6893 if (outsize <= 0)
6894 goto error;
6895 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006896
Victor Stinner3a50e702011-10-18 21:21:00 +02006897error:
6898 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6899 return -2;
6900 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006901 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902}
6903
Victor Stinner3a50e702011-10-18 21:21:00 +02006904/*
6905 * Decode a byte string from a code page into unicode object with an error
6906 * handler.
6907 *
6908 * Returns consumed size if succeed, or raise a WindowsError or
6909 * UnicodeDecodeError exception and returns -1 on error.
6910 */
6911static int
6912decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006913 PyObject **v,
6914 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 const char *errors)
6916{
6917 const char *startin = in;
6918 const char *endin = in + size;
6919 const DWORD flags = decode_code_page_flags(code_page);
6920 /* Ideally, we should get reason from FormatMessage. This is the Windows
6921 2000 English version of the message. */
6922 const char *reason = "No mapping for the Unicode character exists "
6923 "in the target code page.";
6924 /* each step cannot decode more than 1 character, but a character can be
6925 represented as a surrogate pair */
6926 wchar_t buffer[2], *startout, *out;
6927 int insize, outsize;
6928 PyObject *errorHandler = NULL;
6929 PyObject *exc = NULL;
6930 PyObject *encoding_obj = NULL;
6931 char *encoding;
6932 DWORD err;
6933 int ret = -1;
6934
6935 assert(size > 0);
6936
6937 encoding = code_page_name(code_page, &encoding_obj);
6938 if (encoding == NULL)
6939 return -1;
6940
6941 if (errors == NULL || strcmp(errors, "strict") == 0) {
6942 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6943 UnicodeDecodeError. */
6944 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6945 if (exc != NULL) {
6946 PyCodec_StrictErrors(exc);
6947 Py_CLEAR(exc);
6948 }
6949 goto error;
6950 }
6951
6952 if (*v == NULL) {
6953 /* Create unicode object */
6954 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6955 PyErr_NoMemory();
6956 goto error;
6957 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006958 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 if (*v == NULL)
6960 goto error;
6961 startout = PyUnicode_AS_UNICODE(*v);
6962 }
6963 else {
6964 /* Extend unicode object */
6965 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6966 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6967 PyErr_NoMemory();
6968 goto error;
6969 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006970 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 goto error;
6972 startout = PyUnicode_AS_UNICODE(*v) + n;
6973 }
6974
6975 /* Decode the byte string character per character */
6976 out = startout;
6977 while (in < endin)
6978 {
6979 /* Decode a character */
6980 insize = 1;
6981 do
6982 {
6983 outsize = MultiByteToWideChar(code_page, flags,
6984 in, insize,
6985 buffer, Py_ARRAY_LENGTH(buffer));
6986 if (outsize > 0)
6987 break;
6988 err = GetLastError();
6989 if (err != ERROR_NO_UNICODE_TRANSLATION
6990 && err != ERROR_INSUFFICIENT_BUFFER)
6991 {
6992 PyErr_SetFromWindowsErr(0);
6993 goto error;
6994 }
6995 insize++;
6996 }
6997 /* 4=maximum length of a UTF-8 sequence */
6998 while (insize <= 4 && (in + insize) <= endin);
6999
7000 if (outsize <= 0) {
7001 Py_ssize_t startinpos, endinpos, outpos;
7002
7003 startinpos = in - startin;
7004 endinpos = startinpos + 1;
7005 outpos = out - PyUnicode_AS_UNICODE(*v);
7006 if (unicode_decode_call_errorhandler(
7007 errors, &errorHandler,
7008 encoding, reason,
7009 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007010 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 {
7012 goto error;
7013 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007014 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 }
7016 else {
7017 in += insize;
7018 memcpy(out, buffer, outsize * sizeof(wchar_t));
7019 out += outsize;
7020 }
7021 }
7022
7023 /* write a NUL character at the end */
7024 *out = 0;
7025
7026 /* Extend unicode object */
7027 outsize = out - startout;
7028 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007030 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007032
7033error:
7034 Py_XDECREF(encoding_obj);
7035 Py_XDECREF(errorHandler);
7036 Py_XDECREF(exc);
7037 return ret;
7038}
7039
Victor Stinner3a50e702011-10-18 21:21:00 +02007040static PyObject *
7041decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007042 const char *s, Py_ssize_t size,
7043 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044{
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 PyObject *v = NULL;
7046 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 if (code_page < 0) {
7049 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7050 return NULL;
7051 }
7052
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007055
Victor Stinner76a31a62011-11-04 00:05:13 +01007056 do
7057 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 if (size > INT_MAX) {
7060 chunk_size = INT_MAX;
7061 final = 0;
7062 done = 0;
7063 }
7064 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007066 {
7067 chunk_size = (int)size;
7068 final = (consumed == NULL);
7069 done = 1;
7070 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071
Victor Stinner76a31a62011-11-04 00:05:13 +01007072 /* Skip trailing lead-byte unless 'final' is set */
7073 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7074 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007075
Victor Stinner76a31a62011-11-04 00:05:13 +01007076 if (chunk_size == 0 && done) {
7077 if (v != NULL)
7078 break;
7079 Py_INCREF(unicode_empty);
7080 return unicode_empty;
7081 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082
Victor Stinner76a31a62011-11-04 00:05:13 +01007083
7084 converted = decode_code_page_strict(code_page, &v,
7085 s, chunk_size);
7086 if (converted == -2)
7087 converted = decode_code_page_errors(code_page, &v,
7088 s, chunk_size,
7089 errors);
7090 assert(converted != 0);
7091
7092 if (converted < 0) {
7093 Py_XDECREF(v);
7094 return NULL;
7095 }
7096
7097 if (consumed)
7098 *consumed += converted;
7099
7100 s += converted;
7101 size -= converted;
7102 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007103
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007104 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105}
7106
Alexander Belopolsky40018472011-02-26 01:02:56 +00007107PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007108PyUnicode_DecodeCodePageStateful(int code_page,
7109 const char *s,
7110 Py_ssize_t size,
7111 const char *errors,
7112 Py_ssize_t *consumed)
7113{
7114 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7115}
7116
7117PyObject *
7118PyUnicode_DecodeMBCSStateful(const char *s,
7119 Py_ssize_t size,
7120 const char *errors,
7121 Py_ssize_t *consumed)
7122{
7123 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7124}
7125
7126PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007127PyUnicode_DecodeMBCS(const char *s,
7128 Py_ssize_t size,
7129 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007130{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7132}
7133
Victor Stinner3a50e702011-10-18 21:21:00 +02007134static DWORD
7135encode_code_page_flags(UINT code_page, const char *errors)
7136{
7137 if (code_page == CP_UTF8) {
7138 if (winver.dwMajorVersion >= 6)
7139 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7140 and later */
7141 return WC_ERR_INVALID_CHARS;
7142 else
7143 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7144 return 0;
7145 }
7146 else if (code_page == CP_UTF7) {
7147 /* CP_UTF7 only supports flags=0 */
7148 return 0;
7149 }
7150 else {
7151 if (errors != NULL && strcmp(errors, "replace") == 0)
7152 return 0;
7153 else
7154 return WC_NO_BEST_FIT_CHARS;
7155 }
7156}
7157
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007158/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 * Encode a Unicode string to a Windows code page into a byte string in strict
7160 * mode.
7161 *
7162 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7163 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007165static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007166encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007167 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007169{
Victor Stinner554f3f02010-06-16 23:33:54 +00007170 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 BOOL *pusedDefaultChar = &usedDefaultChar;
7172 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007173 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007174 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007175 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 const DWORD flags = encode_code_page_flags(code_page, NULL);
7177 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 /* Create a substring so that we can get the UTF-16 representation
7179 of just the slice under consideration. */
7180 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007181
Martin v. Löwis3d325192011-11-04 18:23:06 +01007182 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007183
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007185 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007187 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007188
Victor Stinner2fc507f2011-11-04 20:06:39 +01007189 substring = PyUnicode_Substring(unicode, offset, offset+len);
7190 if (substring == NULL)
7191 return -1;
7192 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7193 if (p == NULL) {
7194 Py_DECREF(substring);
7195 return -1;
7196 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007197
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007198 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 outsize = WideCharToMultiByte(code_page, flags,
7200 p, size,
7201 NULL, 0,
7202 NULL, pusedDefaultChar);
7203 if (outsize <= 0)
7204 goto error;
7205 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 if (pusedDefaultChar && *pusedDefaultChar) {
7207 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007210
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007214 if (*outbytes == NULL) {
7215 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219 }
7220 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 const Py_ssize_t n = PyBytes_Size(*outbytes);
7223 if (outsize > PY_SSIZE_T_MAX - n) {
7224 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007225 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7229 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233 }
7234
7235 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 outsize = WideCharToMultiByte(code_page, flags,
7237 p, size,
7238 out, outsize,
7239 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007240 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 if (outsize <= 0)
7242 goto error;
7243 if (pusedDefaultChar && *pusedDefaultChar)
7244 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007245 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7250 return -2;
7251 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007252 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007253}
7254
Victor Stinner3a50e702011-10-18 21:21:00 +02007255/*
7256 * Encode a Unicode string to a Windows code page into a byte string using a
7257 * error handler.
7258 *
7259 * Returns consumed characters if succeed, or raise a WindowsError and returns
7260 * -1 on other error.
7261 */
7262static int
7263encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007264 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007265 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007266{
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007268 Py_ssize_t pos = unicode_offset;
7269 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 /* Ideally, we should get reason from FormatMessage. This is the Windows
7271 2000 English version of the message. */
7272 const char *reason = "invalid character";
7273 /* 4=maximum length of a UTF-8 sequence */
7274 char buffer[4];
7275 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7276 Py_ssize_t outsize;
7277 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 PyObject *errorHandler = NULL;
7279 PyObject *exc = NULL;
7280 PyObject *encoding_obj = NULL;
7281 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007282 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 PyObject *rep;
7284 int ret = -1;
7285
7286 assert(insize > 0);
7287
7288 encoding = code_page_name(code_page, &encoding_obj);
7289 if (encoding == NULL)
7290 return -1;
7291
7292 if (errors == NULL || strcmp(errors, "strict") == 0) {
7293 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7294 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007295 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 if (exc != NULL) {
7297 PyCodec_StrictErrors(exc);
7298 Py_DECREF(exc);
7299 }
7300 Py_XDECREF(encoding_obj);
7301 return -1;
7302 }
7303
7304 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7305 pusedDefaultChar = &usedDefaultChar;
7306 else
7307 pusedDefaultChar = NULL;
7308
7309 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7310 PyErr_NoMemory();
7311 goto error;
7312 }
7313 outsize = insize * Py_ARRAY_LENGTH(buffer);
7314
7315 if (*outbytes == NULL) {
7316 /* Create string object */
7317 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7318 if (*outbytes == NULL)
7319 goto error;
7320 out = PyBytes_AS_STRING(*outbytes);
7321 }
7322 else {
7323 /* Extend string object */
7324 Py_ssize_t n = PyBytes_Size(*outbytes);
7325 if (n > PY_SSIZE_T_MAX - outsize) {
7326 PyErr_NoMemory();
7327 goto error;
7328 }
7329 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7330 goto error;
7331 out = PyBytes_AS_STRING(*outbytes) + n;
7332 }
7333
7334 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007335 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007336 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007337 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7338 wchar_t chars[2];
7339 int charsize;
7340 if (ch < 0x10000) {
7341 chars[0] = (wchar_t)ch;
7342 charsize = 1;
7343 }
7344 else {
7345 ch -= 0x10000;
7346 chars[0] = 0xd800 + (ch >> 10);
7347 chars[1] = 0xdc00 + (ch & 0x3ff);
7348 charsize = 2;
7349 }
7350
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007352 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 buffer, Py_ARRAY_LENGTH(buffer),
7354 NULL, pusedDefaultChar);
7355 if (outsize > 0) {
7356 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7357 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007358 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 memcpy(out, buffer, outsize);
7360 out += outsize;
7361 continue;
7362 }
7363 }
7364 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7365 PyErr_SetFromWindowsErr(0);
7366 goto error;
7367 }
7368
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 rep = unicode_encode_call_errorhandler(
7370 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007371 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007372 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 if (rep == NULL)
7374 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007375 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376
7377 if (PyBytes_Check(rep)) {
7378 outsize = PyBytes_GET_SIZE(rep);
7379 if (outsize != 1) {
7380 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7381 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7382 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7383 Py_DECREF(rep);
7384 goto error;
7385 }
7386 out = PyBytes_AS_STRING(*outbytes) + offset;
7387 }
7388 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7389 out += outsize;
7390 }
7391 else {
7392 Py_ssize_t i;
7393 enum PyUnicode_Kind kind;
7394 void *data;
7395
7396 if (PyUnicode_READY(rep) < 0) {
7397 Py_DECREF(rep);
7398 goto error;
7399 }
7400
7401 outsize = PyUnicode_GET_LENGTH(rep);
7402 if (outsize != 1) {
7403 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7404 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7405 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7406 Py_DECREF(rep);
7407 goto error;
7408 }
7409 out = PyBytes_AS_STRING(*outbytes) + offset;
7410 }
7411 kind = PyUnicode_KIND(rep);
7412 data = PyUnicode_DATA(rep);
7413 for (i=0; i < outsize; i++) {
7414 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7415 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007416 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 encoding, unicode,
7418 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 "unable to encode error handler result to ASCII");
7420 Py_DECREF(rep);
7421 goto error;
7422 }
7423 *out = (unsigned char)ch;
7424 out++;
7425 }
7426 }
7427 Py_DECREF(rep);
7428 }
7429 /* write a NUL byte */
7430 *out = 0;
7431 outsize = out - PyBytes_AS_STRING(*outbytes);
7432 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7433 if (_PyBytes_Resize(outbytes, outsize) < 0)
7434 goto error;
7435 ret = 0;
7436
7437error:
7438 Py_XDECREF(encoding_obj);
7439 Py_XDECREF(errorHandler);
7440 Py_XDECREF(exc);
7441 return ret;
7442}
7443
Victor Stinner3a50e702011-10-18 21:21:00 +02007444static PyObject *
7445encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007446 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 const char *errors)
7448{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007449 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007451 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007452 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007453
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 if (PyUnicode_READY(unicode) < 0)
7455 return NULL;
7456 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007457
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 if (code_page < 0) {
7459 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7460 return NULL;
7461 }
7462
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 return PyBytes_FromStringAndSize(NULL, 0);
7465
Victor Stinner7581cef2011-11-03 22:32:33 +01007466 offset = 0;
7467 do
7468 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007470 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007471 chunks. */
7472 if (len > INT_MAX/2) {
7473 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007474 done = 0;
7475 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007476 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007478 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007479 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 done = 1;
7481 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482
Victor Stinner76a31a62011-11-04 00:05:13 +01007483 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007484 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 errors);
7486 if (ret == -2)
7487 ret = encode_code_page_errors(code_page, &outbytes,
7488 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007489 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007490 if (ret < 0) {
7491 Py_XDECREF(outbytes);
7492 return NULL;
7493 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494
Victor Stinner7581cef2011-11-03 22:32:33 +01007495 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 return outbytes;
7500}
7501
7502PyObject *
7503PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7504 Py_ssize_t size,
7505 const char *errors)
7506{
Victor Stinner7581cef2011-11-03 22:32:33 +01007507 PyObject *unicode, *res;
7508 unicode = PyUnicode_FromUnicode(p, size);
7509 if (unicode == NULL)
7510 return NULL;
7511 res = encode_code_page(CP_ACP, unicode, errors);
7512 Py_DECREF(unicode);
7513 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514}
7515
7516PyObject *
7517PyUnicode_EncodeCodePage(int code_page,
7518 PyObject *unicode,
7519 const char *errors)
7520{
Victor Stinner7581cef2011-11-03 22:32:33 +01007521 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007522}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007523
Alexander Belopolsky40018472011-02-26 01:02:56 +00007524PyObject *
7525PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007526{
7527 if (!PyUnicode_Check(unicode)) {
7528 PyErr_BadArgument();
7529 return NULL;
7530 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007531 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007532}
7533
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534#undef NEED_RETRY
7535
Victor Stinner99b95382011-07-04 14:23:54 +02007536#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007537
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538/* --- Character Mapping Codec -------------------------------------------- */
7539
Alexander Belopolsky40018472011-02-26 01:02:56 +00007540PyObject *
7541PyUnicode_DecodeCharmap(const char *s,
7542 Py_ssize_t size,
7543 PyObject *mapping,
7544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007546 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007547 Py_ssize_t startinpos;
7548 Py_ssize_t endinpos;
7549 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007551 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007552 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553 PyObject *errorHandler = NULL;
7554 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007555
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 /* Default to Latin-1 */
7557 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007560 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007564 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007565 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007566 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007567 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007568 Py_ssize_t maplen;
7569 enum PyUnicode_Kind kind;
7570 void *data;
7571 Py_UCS4 x;
7572
7573 if (PyUnicode_READY(mapping) < 0)
7574 return NULL;
7575
7576 maplen = PyUnicode_GET_LENGTH(mapping);
7577 data = PyUnicode_DATA(mapping);
7578 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 while (s < e) {
7580 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007583 x = PyUnicode_READ(kind, data, ch);
7584 else
7585 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007587 if (x == 0xfffe)
7588 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 startinpos = s-starts;
7591 endinpos = startinpos+1;
7592 if (unicode_decode_call_errorhandler(
7593 errors, &errorHandler,
7594 "charmap", "character maps to <undefined>",
7595 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007596 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 goto onError;
7598 }
7599 continue;
7600 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007602 if (unicode_putchar(&v, &outpos, x) < 0)
7603 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007605 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007606 }
7607 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 while (s < e) {
7609 unsigned char ch = *s;
7610 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007611
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7613 w = PyLong_FromLong((long)ch);
7614 if (w == NULL)
7615 goto onError;
7616 x = PyObject_GetItem(mapping, w);
7617 Py_DECREF(w);
7618 if (x == NULL) {
7619 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7620 /* No mapping found means: mapping is undefined. */
7621 PyErr_Clear();
7622 x = Py_None;
7623 Py_INCREF(x);
7624 } else
7625 goto onError;
7626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007627
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 /* Apply mapping */
7629 if (PyLong_Check(x)) {
7630 long value = PyLong_AS_LONG(x);
7631 if (value < 0 || value > 65535) {
7632 PyErr_SetString(PyExc_TypeError,
7633 "character mapping must be in range(65536)");
7634 Py_DECREF(x);
7635 goto onError;
7636 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007637 if (unicode_putchar(&v, &outpos, value) < 0)
7638 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 }
7640 else if (x == Py_None) {
7641 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 startinpos = s-starts;
7643 endinpos = startinpos+1;
7644 if (unicode_decode_call_errorhandler(
7645 errors, &errorHandler,
7646 "charmap", "character maps to <undefined>",
7647 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007648 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 Py_DECREF(x);
7650 goto onError;
7651 }
7652 Py_DECREF(x);
7653 continue;
7654 }
7655 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007656 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007657
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007658 if (PyUnicode_READY(x) < 0)
7659 goto onError;
7660 targetsize = PyUnicode_GET_LENGTH(x);
7661
7662 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007664 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007665 PyUnicode_READ_CHAR(x, 0)) < 0)
7666 goto onError;
7667 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 else if (targetsize > 1) {
7669 /* 1-n mapping */
7670 if (targetsize > extrachars) {
7671 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 Py_ssize_t needed = (targetsize - extrachars) + \
7673 (targetsize << 2);
7674 extrachars += needed;
7675 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007676 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007677 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 Py_DECREF(x);
7679 goto onError;
7680 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007682 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7683 goto onError;
7684 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7685 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 extrachars -= targetsize;
7687 }
7688 /* 1-0 mapping: skip the character */
7689 }
7690 else {
7691 /* wrong return value */
7692 PyErr_SetString(PyExc_TypeError,
7693 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007694 Py_DECREF(x);
7695 goto onError;
7696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 Py_DECREF(x);
7698 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007701 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007702 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703 Py_XDECREF(errorHandler);
7704 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007705 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007706
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 Py_XDECREF(errorHandler);
7709 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 Py_XDECREF(v);
7711 return NULL;
7712}
7713
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007714/* Charmap encoding: the lookup table */
7715
Alexander Belopolsky40018472011-02-26 01:02:56 +00007716struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 PyObject_HEAD
7718 unsigned char level1[32];
7719 int count2, count3;
7720 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721};
7722
7723static PyObject*
7724encoding_map_size(PyObject *obj, PyObject* args)
7725{
7726 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007729}
7730
7731static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 PyDoc_STR("Return the size (in bytes) of this object") },
7734 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007735};
7736
7737static void
7738encoding_map_dealloc(PyObject* o)
7739{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007740 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741}
7742
7743static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 "EncodingMap", /*tp_name*/
7746 sizeof(struct encoding_map), /*tp_basicsize*/
7747 0, /*tp_itemsize*/
7748 /* methods */
7749 encoding_map_dealloc, /*tp_dealloc*/
7750 0, /*tp_print*/
7751 0, /*tp_getattr*/
7752 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007753 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 0, /*tp_repr*/
7755 0, /*tp_as_number*/
7756 0, /*tp_as_sequence*/
7757 0, /*tp_as_mapping*/
7758 0, /*tp_hash*/
7759 0, /*tp_call*/
7760 0, /*tp_str*/
7761 0, /*tp_getattro*/
7762 0, /*tp_setattro*/
7763 0, /*tp_as_buffer*/
7764 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7765 0, /*tp_doc*/
7766 0, /*tp_traverse*/
7767 0, /*tp_clear*/
7768 0, /*tp_richcompare*/
7769 0, /*tp_weaklistoffset*/
7770 0, /*tp_iter*/
7771 0, /*tp_iternext*/
7772 encoding_map_methods, /*tp_methods*/
7773 0, /*tp_members*/
7774 0, /*tp_getset*/
7775 0, /*tp_base*/
7776 0, /*tp_dict*/
7777 0, /*tp_descr_get*/
7778 0, /*tp_descr_set*/
7779 0, /*tp_dictoffset*/
7780 0, /*tp_init*/
7781 0, /*tp_alloc*/
7782 0, /*tp_new*/
7783 0, /*tp_free*/
7784 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785};
7786
7787PyObject*
7788PyUnicode_BuildEncodingMap(PyObject* string)
7789{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007790 PyObject *result;
7791 struct encoding_map *mresult;
7792 int i;
7793 int need_dict = 0;
7794 unsigned char level1[32];
7795 unsigned char level2[512];
7796 unsigned char *mlevel1, *mlevel2, *mlevel3;
7797 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798 int kind;
7799 void *data;
7800 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007803 PyErr_BadArgument();
7804 return NULL;
7805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 kind = PyUnicode_KIND(string);
7807 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007808 memset(level1, 0xFF, sizeof level1);
7809 memset(level2, 0xFF, sizeof level2);
7810
7811 /* If there isn't a one-to-one mapping of NULL to \0,
7812 or if there are non-BMP characters, we need to use
7813 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007815 need_dict = 1;
7816 for (i = 1; i < 256; i++) {
7817 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818 ch = PyUnicode_READ(kind, data, i);
7819 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 need_dict = 1;
7821 break;
7822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007824 /* unmapped character */
7825 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 l1 = ch >> 11;
7827 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007828 if (level1[l1] == 0xFF)
7829 level1[l1] = count2++;
7830 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007831 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832 }
7833
7834 if (count2 >= 0xFF || count3 >= 0xFF)
7835 need_dict = 1;
7836
7837 if (need_dict) {
7838 PyObject *result = PyDict_New();
7839 PyObject *key, *value;
7840 if (!result)
7841 return NULL;
7842 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007844 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 if (!key || !value)
7846 goto failed1;
7847 if (PyDict_SetItem(result, key, value) == -1)
7848 goto failed1;
7849 Py_DECREF(key);
7850 Py_DECREF(value);
7851 }
7852 return result;
7853 failed1:
7854 Py_XDECREF(key);
7855 Py_XDECREF(value);
7856 Py_DECREF(result);
7857 return NULL;
7858 }
7859
7860 /* Create a three-level trie */
7861 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7862 16*count2 + 128*count3 - 1);
7863 if (!result)
7864 return PyErr_NoMemory();
7865 PyObject_Init(result, &EncodingMapType);
7866 mresult = (struct encoding_map*)result;
7867 mresult->count2 = count2;
7868 mresult->count3 = count3;
7869 mlevel1 = mresult->level1;
7870 mlevel2 = mresult->level23;
7871 mlevel3 = mresult->level23 + 16*count2;
7872 memcpy(mlevel1, level1, 32);
7873 memset(mlevel2, 0xFF, 16*count2);
7874 memset(mlevel3, 0, 128*count3);
7875 count3 = 0;
7876 for (i = 1; i < 256; i++) {
7877 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879 /* unmapped character */
7880 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 o1 = PyUnicode_READ(kind, data, i)>>11;
7882 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 i2 = 16*mlevel1[o1] + o2;
7884 if (mlevel2[i2] == 0xFF)
7885 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 i3 = 128*mlevel2[i2] + o3;
7888 mlevel3[i3] = i;
7889 }
7890 return result;
7891}
7892
7893static int
Victor Stinner22168992011-11-20 17:09:18 +01007894encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895{
7896 struct encoding_map *map = (struct encoding_map*)mapping;
7897 int l1 = c>>11;
7898 int l2 = (c>>7) & 0xF;
7899 int l3 = c & 0x7F;
7900 int i;
7901
Victor Stinner22168992011-11-20 17:09:18 +01007902 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 if (c == 0)
7905 return 0;
7906 /* level 1*/
7907 i = map->level1[l1];
7908 if (i == 0xFF) {
7909 return -1;
7910 }
7911 /* level 2*/
7912 i = map->level23[16*i+l2];
7913 if (i == 0xFF) {
7914 return -1;
7915 }
7916 /* level 3 */
7917 i = map->level23[16*map->count2 + 128*i + l3];
7918 if (i == 0) {
7919 return -1;
7920 }
7921 return i;
7922}
7923
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007924/* Lookup the character ch in the mapping. If the character
7925 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007926 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007927static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007928charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929{
Christian Heimes217cfd12007-12-02 14:31:20 +00007930 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007931 PyObject *x;
7932
7933 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935 x = PyObject_GetItem(mapping, w);
7936 Py_DECREF(w);
7937 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7939 /* No mapping found means: mapping is undefined. */
7940 PyErr_Clear();
7941 x = Py_None;
7942 Py_INCREF(x);
7943 return x;
7944 } else
7945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007947 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007949 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 long value = PyLong_AS_LONG(x);
7951 if (value < 0 || value > 255) {
7952 PyErr_SetString(PyExc_TypeError,
7953 "character mapping must be in range(256)");
7954 Py_DECREF(x);
7955 return NULL;
7956 }
7957 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007959 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 /* wrong return value */
7963 PyErr_Format(PyExc_TypeError,
7964 "character mapping must return integer, bytes or None, not %.400s",
7965 x->ob_type->tp_name);
7966 Py_DECREF(x);
7967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 }
7969}
7970
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007971static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007972charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7975 /* exponentially overallocate to minimize reallocations */
7976 if (requiredsize < 2*outsize)
7977 requiredsize = 2*outsize;
7978 if (_PyBytes_Resize(outobj, requiredsize))
7979 return -1;
7980 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007981}
7982
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007987 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 space is available. Return a new reference to the object that
7989 was put in the output buffer, or Py_None, if the mapping was undefined
7990 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007991 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007993charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007994 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007995{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007996 PyObject *rep;
7997 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007998 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999
Christian Heimes90aa7642007-12-19 02:45:37 +00008000 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003 if (res == -1)
8004 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 if (outsize<requiredsize)
8006 if (charmapencode_resize(outobj, outpos, requiredsize))
8007 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008008 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 outstart[(*outpos)++] = (char)res;
8010 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011 }
8012
8013 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008014 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 Py_DECREF(rep);
8018 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 if (PyLong_Check(rep)) {
8021 Py_ssize_t requiredsize = *outpos+1;
8022 if (outsize<requiredsize)
8023 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8024 Py_DECREF(rep);
8025 return enc_EXCEPTION;
8026 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008027 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008029 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 else {
8031 const char *repchars = PyBytes_AS_STRING(rep);
8032 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8033 Py_ssize_t requiredsize = *outpos+repsize;
8034 if (outsize<requiredsize)
8035 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8036 Py_DECREF(rep);
8037 return enc_EXCEPTION;
8038 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008039 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 memcpy(outstart + *outpos, repchars, repsize);
8041 *outpos += repsize;
8042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008043 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044 Py_DECREF(rep);
8045 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008046}
8047
8048/* handle an error in PyUnicode_EncodeCharmap
8049 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050static int
8051charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008052 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008054 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008055 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008056{
8057 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008058 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008059 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008060 enum PyUnicode_Kind kind;
8061 void *data;
8062 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008064 Py_ssize_t collstartpos = *inpos;
8065 Py_ssize_t collendpos = *inpos+1;
8066 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067 char *encoding = "charmap";
8068 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008070 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008071 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008072
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008073 if (PyUnicode_READY(unicode) < 0)
8074 return -1;
8075 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 /* find all unencodable characters */
8077 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008079 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008080 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008081 val = encoding_map_lookup(ch, mapping);
8082 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 break;
8084 ++collendpos;
8085 continue;
8086 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008088 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8089 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 if (rep==NULL)
8091 return -1;
8092 else if (rep!=Py_None) {
8093 Py_DECREF(rep);
8094 break;
8095 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008096 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 }
8099 /* cache callback name lookup
8100 * (if not done yet, i.e. it's the first error) */
8101 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 if ((errors==NULL) || (!strcmp(errors, "strict")))
8103 *known_errorHandler = 1;
8104 else if (!strcmp(errors, "replace"))
8105 *known_errorHandler = 2;
8106 else if (!strcmp(errors, "ignore"))
8107 *known_errorHandler = 3;
8108 else if (!strcmp(errors, "xmlcharrefreplace"))
8109 *known_errorHandler = 4;
8110 else
8111 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112 }
8113 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008115 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116 return -1;
8117 case 2: /* replace */
8118 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 x = charmapencode_output('?', mapping, res, respos);
8120 if (x==enc_EXCEPTION) {
8121 return -1;
8122 }
8123 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008124 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return -1;
8126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008127 }
8128 /* fall through */
8129 case 3: /* ignore */
8130 *inpos = collendpos;
8131 break;
8132 case 4: /* xmlcharrefreplace */
8133 /* generate replacement (temporarily (mis)uses p) */
8134 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 char buffer[2+29+1+1];
8136 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008137 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 for (cp = buffer; *cp; ++cp) {
8139 x = charmapencode_output(*cp, mapping, res, respos);
8140 if (x==enc_EXCEPTION)
8141 return -1;
8142 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008143 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 return -1;
8145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 }
8147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 *inpos = collendpos;
8149 break;
8150 default:
8151 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008154 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008156 if (PyBytes_Check(repunicode)) {
8157 /* Directly copy bytes result to output. */
8158 Py_ssize_t outsize = PyBytes_Size(*res);
8159 Py_ssize_t requiredsize;
8160 repsize = PyBytes_Size(repunicode);
8161 requiredsize = *respos + repsize;
8162 if (requiredsize > outsize)
8163 /* Make room for all additional bytes. */
8164 if (charmapencode_resize(res, respos, requiredsize)) {
8165 Py_DECREF(repunicode);
8166 return -1;
8167 }
8168 memcpy(PyBytes_AsString(*res) + *respos,
8169 PyBytes_AsString(repunicode), repsize);
8170 *respos += repsize;
8171 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008172 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008173 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008176 if (PyUnicode_READY(repunicode) < 0) {
8177 Py_DECREF(repunicode);
8178 return -1;
8179 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008180 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008181 data = PyUnicode_DATA(repunicode);
8182 kind = PyUnicode_KIND(repunicode);
8183 for (index = 0; index < repsize; index++) {
8184 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8185 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008187 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 return -1;
8189 }
8190 else if (x==enc_FAILED) {
8191 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008192 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return -1;
8194 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 }
8196 *inpos = newpos;
8197 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 }
8199 return 0;
8200}
8201
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008203_PyUnicode_EncodeCharmap(PyObject *unicode,
8204 PyObject *mapping,
8205 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 /* output object */
8208 PyObject *res = NULL;
8209 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008210 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008211 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008213 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214 PyObject *errorHandler = NULL;
8215 PyObject *exc = NULL;
8216 /* the following variable is used for caching string comparisons
8217 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8218 * 3=ignore, 4=xmlcharrefreplace */
8219 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008221 if (PyUnicode_READY(unicode) < 0)
8222 return NULL;
8223 size = PyUnicode_GET_LENGTH(unicode);
8224
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225 /* Default to Latin-1 */
8226 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008227 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 /* allocate enough for a simple encoding without
8230 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008231 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232 if (res == NULL)
8233 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008234 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 if (x==enc_EXCEPTION) /* error */
8242 goto onError;
8243 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 &exc,
8246 &known_errorHandler, &errorHandler, errors,
8247 &res, &respos)) {
8248 goto onError;
8249 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 else
8252 /* done with this character => adjust input position */
8253 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008257 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008258 if (_PyBytes_Resize(&res, respos) < 0)
8259 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008260
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 Py_XDECREF(exc);
8262 Py_XDECREF(errorHandler);
8263 return res;
8264
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 Py_XDECREF(res);
8267 Py_XDECREF(exc);
8268 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 return NULL;
8270}
8271
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008272/* Deprecated */
8273PyObject *
8274PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8275 Py_ssize_t size,
8276 PyObject *mapping,
8277 const char *errors)
8278{
8279 PyObject *result;
8280 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8281 if (unicode == NULL)
8282 return NULL;
8283 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8284 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008285 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008286}
8287
Alexander Belopolsky40018472011-02-26 01:02:56 +00008288PyObject *
8289PyUnicode_AsCharmapString(PyObject *unicode,
8290 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291{
8292 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 PyErr_BadArgument();
8294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008296 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297}
8298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300static void
8301make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008303 Py_ssize_t startpos, Py_ssize_t endpos,
8304 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 *exceptionObject = _PyUnicodeTranslateError_Create(
8308 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 }
8310 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8312 goto onError;
8313 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8314 goto onError;
8315 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8316 goto onError;
8317 return;
8318 onError:
8319 Py_DECREF(*exceptionObject);
8320 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 }
8322}
8323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325static void
8326raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328 Py_ssize_t startpos, Py_ssize_t endpos,
8329 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330{
8331 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335}
8336
8337/* error handling callback helper:
8338 build arguments, call the callback and check the arguments,
8339 put the result into newpos and return the replacement string, which
8340 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008341static PyObject *
8342unicode_translate_call_errorhandler(const char *errors,
8343 PyObject **errorHandler,
8344 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008346 Py_ssize_t startpos, Py_ssize_t endpos,
8347 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008349 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008351 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 PyObject *restuple;
8353 PyObject *resunicode;
8354
8355 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 }
8360
8361 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365
8366 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008371 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 Py_DECREF(restuple);
8373 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 }
8375 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 &resunicode, &i_newpos)) {
8377 Py_DECREF(restuple);
8378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008382 else
8383 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8386 Py_DECREF(restuple);
8387 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008388 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 Py_INCREF(resunicode);
8390 Py_DECREF(restuple);
8391 return resunicode;
8392}
8393
8394/* Lookup the character ch in the mapping and put the result in result,
8395 which must be decrefed by the caller.
8396 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008397static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399{
Christian Heimes217cfd12007-12-02 14:31:20 +00008400 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 PyObject *x;
8402
8403 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 x = PyObject_GetItem(mapping, w);
8406 Py_DECREF(w);
8407 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8409 /* No mapping found means: use 1:1 mapping. */
8410 PyErr_Clear();
8411 *result = NULL;
8412 return 0;
8413 } else
8414 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
8416 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 *result = x;
8418 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008420 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 long value = PyLong_AS_LONG(x);
8422 long max = PyUnicode_GetMax();
8423 if (value < 0 || value > max) {
8424 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008425 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 Py_DECREF(x);
8427 return -1;
8428 }
8429 *result = x;
8430 return 0;
8431 }
8432 else if (PyUnicode_Check(x)) {
8433 *result = x;
8434 return 0;
8435 }
8436 else {
8437 /* wrong return value */
8438 PyErr_SetString(PyExc_TypeError,
8439 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 Py_DECREF(x);
8441 return -1;
8442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443}
8444/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 if not reallocate and adjust various state variables.
8446 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008447static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008452 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 /* exponentially overallocate to minimize reallocations */
8454 if (requiredsize < 2 * oldsize)
8455 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8457 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 }
8461 return 0;
8462}
8463/* lookup the character, put the result in the output string and adjust
8464 various state variables. Return a new reference to the object that
8465 was put in the output buffer in *result, or Py_None, if the mapping was
8466 undefined (in which case no character was written).
8467 The called must decref result.
8468 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008469static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8471 PyObject *mapping, Py_UCS4 **output,
8472 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008473 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8476 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481 }
8482 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008484 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 }
8488 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 Py_ssize_t repsize;
8490 if (PyUnicode_READY(*res) == -1)
8491 return -1;
8492 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 if (repsize==1) {
8494 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008495 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 }
8497 else if (repsize!=0) {
8498 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 Py_ssize_t requiredsize = *opos +
8500 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 Py_ssize_t i;
8503 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 for(i = 0; i < repsize; i++)
8506 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 }
8509 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 return 0;
8512}
8513
Alexander Belopolsky40018472011-02-26 01:02:56 +00008514PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515_PyUnicode_TranslateCharmap(PyObject *input,
8516 PyObject *mapping,
8517 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 /* input object */
8520 char *idata;
8521 Py_ssize_t size, i;
8522 int kind;
8523 /* output buffer */
8524 Py_UCS4 *output = NULL;
8525 Py_ssize_t osize;
8526 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 char *reason = "character maps to <undefined>";
8530 PyObject *errorHandler = NULL;
8531 PyObject *exc = NULL;
8532 /* the following variable is used for caching string comparisons
8533 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8534 * 3=ignore, 4=xmlcharrefreplace */
8535 int known_errorHandler = -1;
8536
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 PyErr_BadArgument();
8539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 if (PyUnicode_READY(input) == -1)
8543 return NULL;
8544 idata = (char*)PyUnicode_DATA(input);
8545 kind = PyUnicode_KIND(input);
8546 size = PyUnicode_GET_LENGTH(input);
8547 i = 0;
8548
8549 if (size == 0) {
8550 Py_INCREF(input);
8551 return input;
8552 }
8553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 /* allocate enough for a simple 1:1 translation without
8555 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 osize = size;
8557 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8558 opos = 0;
8559 if (output == NULL) {
8560 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 /* try to encode it */
8566 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 if (charmaptranslate_output(input, i, mapping,
8568 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 Py_XDECREF(x);
8570 goto onError;
8571 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008572 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 else { /* untranslatable character */
8576 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8577 Py_ssize_t repsize;
8578 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 Py_ssize_t collstart = i;
8582 Py_ssize_t collend = i+1;
8583 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 while (collend < size) {
8587 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 goto onError;
8589 Py_XDECREF(x);
8590 if (x!=Py_None)
8591 break;
8592 ++collend;
8593 }
8594 /* cache callback name lookup
8595 * (if not done yet, i.e. it's the first error) */
8596 if (known_errorHandler==-1) {
8597 if ((errors==NULL) || (!strcmp(errors, "strict")))
8598 known_errorHandler = 1;
8599 else if (!strcmp(errors, "replace"))
8600 known_errorHandler = 2;
8601 else if (!strcmp(errors, "ignore"))
8602 known_errorHandler = 3;
8603 else if (!strcmp(errors, "xmlcharrefreplace"))
8604 known_errorHandler = 4;
8605 else
8606 known_errorHandler = 0;
8607 }
8608 switch (known_errorHandler) {
8609 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 raise_translate_exception(&exc, input, collstart,
8611 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008612 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 case 2: /* replace */
8614 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 for (coll = collstart; coll<collend; coll++)
8616 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 /* fall through */
8618 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 break;
8621 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 /* generate replacement (temporarily (mis)uses i) */
8623 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 char buffer[2+29+1+1];
8625 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8627 if (charmaptranslate_makespace(&output, &osize,
8628 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 goto onError;
8630 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 break;
8635 default:
8636 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 reason, input, &exc,
8638 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008639 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008641 if (PyUnicode_READY(repunicode) < 0) {
8642 Py_DECREF(repunicode);
8643 goto onError;
8644 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 repsize = PyUnicode_GET_LENGTH(repunicode);
8647 if (charmaptranslate_makespace(&output, &osize,
8648 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 Py_DECREF(repunicode);
8650 goto onError;
8651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 for (uni2 = 0; repsize-->0; ++uni2)
8653 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8654 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657 }
8658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8660 if (!res)
8661 goto onError;
8662 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 Py_XDECREF(exc);
8664 Py_XDECREF(errorHandler);
8665 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 Py_XDECREF(exc);
8670 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 return NULL;
8672}
8673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674/* Deprecated. Use PyUnicode_Translate instead. */
8675PyObject *
8676PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8677 Py_ssize_t size,
8678 PyObject *mapping,
8679 const char *errors)
8680{
8681 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8682 if (!unicode)
8683 return NULL;
8684 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8685}
8686
Alexander Belopolsky40018472011-02-26 01:02:56 +00008687PyObject *
8688PyUnicode_Translate(PyObject *str,
8689 PyObject *mapping,
8690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691{
8692 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008693
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 str = PyUnicode_FromObject(str);
8695 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 Py_DECREF(str);
8699 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008700
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 Py_XDECREF(str);
8703 return NULL;
8704}
Tim Petersced69f82003-09-16 20:30:58 +00008705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008707fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708{
8709 /* No need to call PyUnicode_READY(self) because this function is only
8710 called as a callback from fixup() which does it already. */
8711 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8712 const int kind = PyUnicode_KIND(self);
8713 void *data = PyUnicode_DATA(self);
8714 Py_UCS4 maxchar = 0, ch, fixed;
8715 Py_ssize_t i;
8716
8717 for (i = 0; i < len; ++i) {
8718 ch = PyUnicode_READ(kind, data, i);
8719 fixed = 0;
8720 if (ch > 127) {
8721 if (Py_UNICODE_ISSPACE(ch))
8722 fixed = ' ';
8723 else {
8724 const int decimal = Py_UNICODE_TODECIMAL(ch);
8725 if (decimal >= 0)
8726 fixed = '0' + decimal;
8727 }
8728 if (fixed != 0) {
8729 if (fixed > maxchar)
8730 maxchar = fixed;
8731 PyUnicode_WRITE(kind, data, i, fixed);
8732 }
8733 else if (ch > maxchar)
8734 maxchar = ch;
8735 }
8736 else if (ch > maxchar)
8737 maxchar = ch;
8738 }
8739
8740 return maxchar;
8741}
8742
8743PyObject *
8744_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8745{
8746 if (!PyUnicode_Check(unicode)) {
8747 PyErr_BadInternalCall();
8748 return NULL;
8749 }
8750 if (PyUnicode_READY(unicode) == -1)
8751 return NULL;
8752 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8753 /* If the string is already ASCII, just return the same string */
8754 Py_INCREF(unicode);
8755 return unicode;
8756 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008757 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758}
8759
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008760PyObject *
8761PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8762 Py_ssize_t length)
8763{
Victor Stinnerf0124502011-11-21 23:12:56 +01008764 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008765 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008766 Py_UCS4 maxchar;
8767 enum PyUnicode_Kind kind;
8768 void *data;
8769
8770 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008771 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008772 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008773 if (ch > 127) {
8774 int decimal = Py_UNICODE_TODECIMAL(ch);
8775 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008776 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008777 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008778 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008779 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008780
8781 /* Copy to a new string */
8782 decimal = PyUnicode_New(length, maxchar);
8783 if (decimal == NULL)
8784 return decimal;
8785 kind = PyUnicode_KIND(decimal);
8786 data = PyUnicode_DATA(decimal);
8787 /* Iterate over code points */
8788 for (i = 0; i < length; i++) {
8789 Py_UNICODE ch = s[i];
8790 if (ch > 127) {
8791 int decimal = Py_UNICODE_TODECIMAL(ch);
8792 if (decimal >= 0)
8793 ch = '0' + decimal;
8794 }
8795 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008797 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008798}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008799/* --- Decimal Encoder ---------------------------------------------------- */
8800
Alexander Belopolsky40018472011-02-26 01:02:56 +00008801int
8802PyUnicode_EncodeDecimal(Py_UNICODE *s,
8803 Py_ssize_t length,
8804 char *output,
8805 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008806{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008807 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008808 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008809 enum PyUnicode_Kind kind;
8810 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008811
8812 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 PyErr_BadArgument();
8814 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008815 }
8816
Victor Stinner42bf7752011-11-21 22:52:58 +01008817 unicode = PyUnicode_FromUnicode(s, length);
8818 if (unicode == NULL)
8819 return -1;
8820
Victor Stinner6345be92011-11-25 20:09:01 +01008821 if (PyUnicode_READY(unicode) < 0) {
8822 Py_DECREF(unicode);
8823 return -1;
8824 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008825 kind = PyUnicode_KIND(unicode);
8826 data = PyUnicode_DATA(unicode);
8827
Victor Stinnerb84d7232011-11-22 01:50:07 +01008828 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008829 PyObject *exc;
8830 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008832 Py_ssize_t startpos;
8833
8834 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008835
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008837 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008838 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 decimal = Py_UNICODE_TODECIMAL(ch);
8842 if (decimal >= 0) {
8843 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008844 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 continue;
8846 }
8847 if (0 < ch && ch < 256) {
8848 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008849 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 continue;
8851 }
Victor Stinner6345be92011-11-25 20:09:01 +01008852
Victor Stinner42bf7752011-11-21 22:52:58 +01008853 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008854 exc = NULL;
8855 raise_encode_exception(&exc, "decimal", unicode,
8856 startpos, startpos+1,
8857 "invalid decimal Unicode string");
8858 Py_XDECREF(exc);
8859 Py_DECREF(unicode);
8860 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008861 }
8862 /* 0-terminate the output string */
8863 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008864 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008865 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008866}
8867
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868/* --- Helpers ------------------------------------------------------------ */
8869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008871any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 Py_ssize_t start,
8873 Py_ssize_t end)
8874{
8875 int kind1, kind2, kind;
8876 void *buf1, *buf2;
8877 Py_ssize_t len1, len2, result;
8878
8879 kind1 = PyUnicode_KIND(s1);
8880 kind2 = PyUnicode_KIND(s2);
8881 kind = kind1 > kind2 ? kind1 : kind2;
8882 buf1 = PyUnicode_DATA(s1);
8883 buf2 = PyUnicode_DATA(s2);
8884 if (kind1 != kind)
8885 buf1 = _PyUnicode_AsKind(s1, kind);
8886 if (!buf1)
8887 return -2;
8888 if (kind2 != kind)
8889 buf2 = _PyUnicode_AsKind(s2, kind);
8890 if (!buf2) {
8891 if (kind1 != kind) PyMem_Free(buf1);
8892 return -2;
8893 }
8894 len1 = PyUnicode_GET_LENGTH(s1);
8895 len2 = PyUnicode_GET_LENGTH(s2);
8896
Victor Stinner794d5672011-10-10 03:21:36 +02008897 if (direction > 0) {
8898 switch(kind) {
8899 case PyUnicode_1BYTE_KIND:
8900 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8901 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8902 else
8903 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8904 break;
8905 case PyUnicode_2BYTE_KIND:
8906 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8907 break;
8908 case PyUnicode_4BYTE_KIND:
8909 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8910 break;
8911 default:
8912 assert(0); result = -2;
8913 }
8914 }
8915 else {
8916 switch(kind) {
8917 case PyUnicode_1BYTE_KIND:
8918 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8919 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8920 else
8921 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8922 break;
8923 case PyUnicode_2BYTE_KIND:
8924 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8925 break;
8926 case PyUnicode_4BYTE_KIND:
8927 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8928 break;
8929 default:
8930 assert(0); result = -2;
8931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 }
8933
8934 if (kind1 != kind)
8935 PyMem_Free(buf1);
8936 if (kind2 != kind)
8937 PyMem_Free(buf2);
8938
8939 return result;
8940}
8941
8942Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008943_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 Py_ssize_t n_buffer,
8945 void *digits, Py_ssize_t n_digits,
8946 Py_ssize_t min_width,
8947 const char *grouping,
8948 const char *thousands_sep)
8949{
8950 switch(kind) {
8951 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008952 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8953 return _PyUnicode_ascii_InsertThousandsGrouping(
8954 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8955 min_width, grouping, thousands_sep);
8956 else
8957 return _PyUnicode_ucs1_InsertThousandsGrouping(
8958 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8959 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 case PyUnicode_2BYTE_KIND:
8961 return _PyUnicode_ucs2_InsertThousandsGrouping(
8962 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8963 min_width, grouping, thousands_sep);
8964 case PyUnicode_4BYTE_KIND:
8965 return _PyUnicode_ucs4_InsertThousandsGrouping(
8966 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8967 min_width, grouping, thousands_sep);
8968 }
8969 assert(0);
8970 return -1;
8971}
8972
8973
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, adds `len` to negative `start` / `end` values
   and clamps anything still negative to 0.  `start` and `end` must be
   modifiable lvalues; `len` is evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe under an unbraced if/else; invoke it with
   a trailing semicolon.  The #undef makes this definition take effect
   even if a macro of the same name was defined earlier in the
   translation unit. */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008988
Alexander Belopolsky40018472011-02-26 01:02:56 +00008989Py_ssize_t
8990PyUnicode_Count(PyObject *str,
8991 PyObject *substr,
8992 Py_ssize_t start,
8993 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008996 PyObject* str_obj;
8997 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 int kind1, kind2, kind;
8999 void *buf1 = NULL, *buf2 = NULL;
9000 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009002 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009005 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009006 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 Py_DECREF(str_obj);
9008 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 }
Tim Petersced69f82003-09-16 20:30:58 +00009010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 kind1 = PyUnicode_KIND(str_obj);
9012 kind2 = PyUnicode_KIND(sub_obj);
9013 kind = kind1 > kind2 ? kind1 : kind2;
9014 buf1 = PyUnicode_DATA(str_obj);
9015 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009016 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 if (!buf1)
9018 goto onError;
9019 buf2 = PyUnicode_DATA(sub_obj);
9020 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009021 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 if (!buf2)
9023 goto onError;
9024 len1 = PyUnicode_GET_LENGTH(str_obj);
9025 len2 = PyUnicode_GET_LENGTH(sub_obj);
9026
9027 ADJUST_INDICES(start, end, len1);
9028 switch(kind) {
9029 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009030 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9031 result = asciilib_count(
9032 ((Py_UCS1*)buf1) + start, end - start,
9033 buf2, len2, PY_SSIZE_T_MAX
9034 );
9035 else
9036 result = ucs1lib_count(
9037 ((Py_UCS1*)buf1) + start, end - start,
9038 buf2, len2, PY_SSIZE_T_MAX
9039 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 break;
9041 case PyUnicode_2BYTE_KIND:
9042 result = ucs2lib_count(
9043 ((Py_UCS2*)buf1) + start, end - start,
9044 buf2, len2, PY_SSIZE_T_MAX
9045 );
9046 break;
9047 case PyUnicode_4BYTE_KIND:
9048 result = ucs4lib_count(
9049 ((Py_UCS4*)buf1) + start, end - start,
9050 buf2, len2, PY_SSIZE_T_MAX
9051 );
9052 break;
9053 default:
9054 assert(0); result = 0;
9055 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009056
9057 Py_DECREF(sub_obj);
9058 Py_DECREF(str_obj);
9059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (kind1 != kind)
9061 PyMem_Free(buf1);
9062 if (kind2 != kind)
9063 PyMem_Free(buf2);
9064
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 onError:
9067 Py_DECREF(sub_obj);
9068 Py_DECREF(str_obj);
9069 if (kind1 != kind && buf1)
9070 PyMem_Free(buf1);
9071 if (kind2 != kind && buf2)
9072 PyMem_Free(buf2);
9073 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074}
9075
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076Py_ssize_t
9077PyUnicode_Find(PyObject *str,
9078 PyObject *sub,
9079 Py_ssize_t start,
9080 Py_ssize_t end,
9081 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009083 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009084
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009088 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 Py_DECREF(str);
9091 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 }
Tim Petersced69f82003-09-16 20:30:58 +00009093
Victor Stinner794d5672011-10-10 03:21:36 +02009094 result = any_find_slice(direction,
9095 str, sub, start, end
9096 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009097
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009099 Py_DECREF(sub);
9100
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 return result;
9102}
9103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104Py_ssize_t
9105PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9106 Py_ssize_t start, Py_ssize_t end,
9107 int direction)
9108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009110 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 if (PyUnicode_READY(str) == -1)
9112 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009113 if (start < 0 || end < 0) {
9114 PyErr_SetString(PyExc_IndexError, "string index out of range");
9115 return -2;
9116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 if (end > PyUnicode_GET_LENGTH(str))
9118 end = PyUnicode_GET_LENGTH(str);
9119 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009120 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9121 kind, end-start, ch, direction);
9122 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009124 else
9125 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126}
9127
Alexander Belopolsky40018472011-02-26 01:02:56 +00009128static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009129tailmatch(PyObject *self,
9130 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009131 Py_ssize_t start,
9132 Py_ssize_t end,
9133 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009135 int kind_self;
9136 int kind_sub;
9137 void *data_self;
9138 void *data_sub;
9139 Py_ssize_t offset;
9140 Py_ssize_t i;
9141 Py_ssize_t end_sub;
9142
9143 if (PyUnicode_READY(self) == -1 ||
9144 PyUnicode_READY(substring) == -1)
9145 return 0;
9146
9147 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148 return 1;
9149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9151 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 kind_self = PyUnicode_KIND(self);
9156 data_self = PyUnicode_DATA(self);
9157 kind_sub = PyUnicode_KIND(substring);
9158 data_sub = PyUnicode_DATA(substring);
9159 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9160
9161 if (direction > 0)
9162 offset = end;
9163 else
9164 offset = start;
9165
9166 if (PyUnicode_READ(kind_self, data_self, offset) ==
9167 PyUnicode_READ(kind_sub, data_sub, 0) &&
9168 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9169 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9170 /* If both are of the same kind, memcmp is sufficient */
9171 if (kind_self == kind_sub) {
9172 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009173 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 data_sub,
9175 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009176 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 }
9178 /* otherwise we have to compare each character by first accesing it */
9179 else {
9180 /* We do not need to compare 0 and len(substring)-1 because
9181 the if statement above ensured already that they are equal
9182 when we end up here. */
9183 // TODO: honor direction and do a forward or backwards search
9184 for (i = 1; i < end_sub; ++i) {
9185 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9186 PyUnicode_READ(kind_sub, data_sub, i))
9187 return 0;
9188 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009189 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191 }
9192
9193 return 0;
9194}
9195
Alexander Belopolsky40018472011-02-26 01:02:56 +00009196Py_ssize_t
9197PyUnicode_Tailmatch(PyObject *str,
9198 PyObject *substr,
9199 Py_ssize_t start,
9200 Py_ssize_t end,
9201 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009203 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009204
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 str = PyUnicode_FromObject(str);
9206 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 substr = PyUnicode_FromObject(substr);
9209 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 Py_DECREF(str);
9211 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212 }
Tim Petersced69f82003-09-16 20:30:58 +00009213
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009214 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 Py_DECREF(str);
9217 Py_DECREF(substr);
9218 return result;
9219}
9220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221/* Apply fixfct filter to the Unicode object self and return a
9222 reference to the modified object */
9223
Alexander Belopolsky40018472011-02-26 01:02:56 +00009224static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009225fixup(PyObject *self,
9226 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 PyObject *u;
9229 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009230 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231
Victor Stinner87af4f22011-11-21 23:03:47 +01009232 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009235 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 /* fix functions return the new maximum character in a string,
9238 if the kind of the resulting unicode object does not change,
9239 everything is fine. Otherwise we need to change the string kind
9240 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009241 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009242
9243 if (maxchar_new == 0) {
9244 /* no changes */;
9245 if (PyUnicode_CheckExact(self)) {
9246 Py_DECREF(u);
9247 Py_INCREF(self);
9248 return self;
9249 }
9250 else
9251 return u;
9252 }
9253
9254 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 maxchar_new = 127;
9256 else if (maxchar_new <= 255)
9257 maxchar_new = 255;
9258 else if (maxchar_new <= 65535)
9259 maxchar_new = 65535;
9260 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009261 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262
Victor Stinnereaab6042011-12-11 22:22:39 +01009263 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009265
9266 /* In case the maximum character changed, we need to
9267 convert the string to the new category. */
9268 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9269 if (v == NULL) {
9270 Py_DECREF(u);
9271 return NULL;
9272 }
9273 if (maxchar_new > maxchar_old) {
9274 /* If the maxchar increased so that the kind changed, not all
9275 characters are representable anymore and we need to fix the
9276 string again. This only happens in very few cases. */
9277 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9278 maxchar_old = fixfct(v);
9279 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 }
9281 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009282 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009284 Py_DECREF(u);
9285 assert(_PyUnicode_CheckConsistency(v, 1));
9286 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287}
9288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009290fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 /* No need to call PyUnicode_READY(self) because this function is only
9293 called as a callback from fixup() which does it already. */
9294 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9295 const int kind = PyUnicode_KIND(self);
9296 void *data = PyUnicode_DATA(self);
9297 int touched = 0;
9298 Py_UCS4 maxchar = 0;
9299 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 for (i = 0; i < len; ++i) {
9302 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9303 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9304 if (up != ch) {
9305 if (up > maxchar)
9306 maxchar = up;
9307 PyUnicode_WRITE(kind, data, i, up);
9308 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 else if (ch > maxchar)
9311 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 }
9313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 if (touched)
9315 return maxchar;
9316 else
9317 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318}
9319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009321fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9324 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9325 const int kind = PyUnicode_KIND(self);
9326 void *data = PyUnicode_DATA(self);
9327 int touched = 0;
9328 Py_UCS4 maxchar = 0;
9329 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 for(i = 0; i < len; ++i) {
9332 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9333 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9334 if (lo != ch) {
9335 if (lo > maxchar)
9336 maxchar = lo;
9337 PyUnicode_WRITE(kind, data, i, lo);
9338 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 else if (ch > maxchar)
9341 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 }
9343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 if (touched)
9345 return maxchar;
9346 else
9347 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348}
9349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009351fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9354 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9355 const int kind = PyUnicode_KIND(self);
9356 void *data = PyUnicode_DATA(self);
9357 int touched = 0;
9358 Py_UCS4 maxchar = 0;
9359 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 for(i = 0; i < len; ++i) {
9362 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9363 Py_UCS4 nu = 0;
9364
9365 if (Py_UNICODE_ISUPPER(ch))
9366 nu = Py_UNICODE_TOLOWER(ch);
9367 else if (Py_UNICODE_ISLOWER(ch))
9368 nu = Py_UNICODE_TOUPPER(ch);
9369
9370 if (nu != 0) {
9371 if (nu > maxchar)
9372 maxchar = nu;
9373 PyUnicode_WRITE(kind, data, i, nu);
9374 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 else if (ch > maxchar)
9377 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 }
9379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 if (touched)
9381 return maxchar;
9382 else
9383 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384}
9385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009387fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9390 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9391 const int kind = PyUnicode_KIND(self);
9392 void *data = PyUnicode_DATA(self);
9393 int touched = 0;
9394 Py_UCS4 maxchar = 0;
9395 Py_ssize_t i = 0;
9396 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009398 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400
9401 ch = PyUnicode_READ(kind, data, i);
9402 if (!Py_UNICODE_ISUPPER(ch)) {
9403 maxchar = Py_UNICODE_TOUPPER(ch);
9404 PyUnicode_WRITE(kind, data, i, maxchar);
9405 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 ++i;
9408 for(; i < len; ++i) {
9409 ch = PyUnicode_READ(kind, data, i);
9410 if (!Py_UNICODE_ISLOWER(ch)) {
9411 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9412 if (lo > maxchar)
9413 maxchar = lo;
9414 PyUnicode_WRITE(kind, data, i, lo);
9415 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 else if (ch > maxchar)
9418 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420
9421 if (touched)
9422 return maxchar;
9423 else
9424 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425}
9426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009428fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9431 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9432 const int kind = PyUnicode_KIND(self);
9433 void *data = PyUnicode_DATA(self);
9434 Py_UCS4 maxchar = 0;
9435 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436 int previous_is_cased;
9437
9438 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 if (len == 1) {
9440 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9441 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9442 if (ti != ch) {
9443 PyUnicode_WRITE(kind, data, i, ti);
9444 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 }
9446 else
9447 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 for(; i < len; ++i) {
9451 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9452 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009453
Benjamin Peterson29060642009-01-31 22:14:21 +00009454 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 nu = Py_UNICODE_TOTITLE(ch);
9458
9459 if (nu > maxchar)
9460 maxchar = nu;
9461 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009462
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 if (Py_UNICODE_ISLOWER(ch) ||
9464 Py_UNICODE_ISUPPER(ch) ||
9465 Py_UNICODE_ISTITLE(ch))
9466 previous_is_cased = 1;
9467 else
9468 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471}
9472
Tim Peters8ce9f162004-08-27 01:49:32 +00009473PyObject *
9474PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009477 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009479 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009480 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9481 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009482 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009484 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009486 int use_memcpy;
9487 unsigned char *res_data = NULL, *sep_data = NULL;
9488 PyObject *last_obj;
9489 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490
Tim Peters05eba1f2004-08-27 21:32:02 +00009491 fseq = PySequence_Fast(seq, "");
9492 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009493 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009494 }
9495
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009496 /* NOTE: the following code can't call back into Python code,
9497 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009498 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009499
Tim Peters05eba1f2004-08-27 21:32:02 +00009500 seqlen = PySequence_Fast_GET_SIZE(fseq);
9501 /* If empty sequence, return u"". */
9502 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009503 Py_DECREF(fseq);
9504 Py_INCREF(unicode_empty);
9505 res = unicode_empty;
9506 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009507 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009508
Tim Peters05eba1f2004-08-27 21:32:02 +00009509 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009511 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009512 if (seqlen == 1) {
9513 if (PyUnicode_CheckExact(items[0])) {
9514 res = items[0];
9515 Py_INCREF(res);
9516 Py_DECREF(fseq);
9517 return res;
9518 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009519 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009520 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009521 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009522 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009523 /* Set up sep and seplen */
9524 if (separator == NULL) {
9525 /* fall back to a blank space separator */
9526 sep = PyUnicode_FromOrdinal(' ');
9527 if (!sep)
9528 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009529 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009530 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009531 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009532 else {
9533 if (!PyUnicode_Check(separator)) {
9534 PyErr_Format(PyExc_TypeError,
9535 "separator: expected str instance,"
9536 " %.80s found",
9537 Py_TYPE(separator)->tp_name);
9538 goto onError;
9539 }
9540 if (PyUnicode_READY(separator))
9541 goto onError;
9542 sep = separator;
9543 seplen = PyUnicode_GET_LENGTH(separator);
9544 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9545 /* inc refcount to keep this code path symmetric with the
9546 above case of a blank separator */
9547 Py_INCREF(sep);
9548 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009549 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009550 }
9551
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009552 /* There are at least two things to join, or else we have a subclass
9553 * of str in the sequence.
9554 * Do a pre-pass to figure out the total amount of space we'll
9555 * need (sz), and see whether all argument are strings.
9556 */
9557 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009558#ifdef Py_DEBUG
9559 use_memcpy = 0;
9560#else
9561 use_memcpy = 1;
9562#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009563 for (i = 0; i < seqlen; i++) {
9564 const Py_ssize_t old_sz = sz;
9565 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 if (!PyUnicode_Check(item)) {
9567 PyErr_Format(PyExc_TypeError,
9568 "sequence item %zd: expected str instance,"
9569 " %.80s found",
9570 i, Py_TYPE(item)->tp_name);
9571 goto onError;
9572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 if (PyUnicode_READY(item) == -1)
9574 goto onError;
9575 sz += PyUnicode_GET_LENGTH(item);
9576 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009577 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009578 if (i != 0)
9579 sz += seplen;
9580 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9581 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009583 goto onError;
9584 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009585 if (use_memcpy && last_obj != NULL) {
9586 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9587 use_memcpy = 0;
9588 }
9589 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009590 }
Tim Petersced69f82003-09-16 20:30:58 +00009591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009593 if (res == NULL)
9594 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009595
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009597#ifdef Py_DEBUG
9598 use_memcpy = 0;
9599#else
9600 if (use_memcpy) {
9601 res_data = PyUnicode_1BYTE_DATA(res);
9602 kind = PyUnicode_KIND(res);
9603 if (seplen != 0)
9604 sep_data = PyUnicode_1BYTE_DATA(sep);
9605 }
9606#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009608 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009609 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009611 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009612 if (use_memcpy) {
9613 Py_MEMCPY(res_data,
9614 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009615 kind * seplen);
9616 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 }
9618 else {
9619 copy_characters(res, res_offset, sep, 0, seplen);
9620 res_offset += seplen;
9621 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009622 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009623 itemlen = PyUnicode_GET_LENGTH(item);
9624 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 if (use_memcpy) {
9626 Py_MEMCPY(res_data,
9627 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009628 kind * itemlen);
9629 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009630 }
9631 else {
9632 copy_characters(res, res_offset, item, 0, itemlen);
9633 res_offset += itemlen;
9634 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009635 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009636 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009637 if (use_memcpy)
9638 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009639 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009640 else
9641 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009642
Tim Peters05eba1f2004-08-27 21:32:02 +00009643 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009645 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009649 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009651 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652 return NULL;
9653}
9654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655#define FILL(kind, data, value, start, length) \
9656 do { \
9657 Py_ssize_t i_ = 0; \
9658 assert(kind != PyUnicode_WCHAR_KIND); \
9659 switch ((kind)) { \
9660 case PyUnicode_1BYTE_KIND: { \
9661 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9662 memset(to_, (unsigned char)value, length); \
9663 break; \
9664 } \
9665 case PyUnicode_2BYTE_KIND: { \
9666 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9667 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9668 break; \
9669 } \
9670 default: { \
9671 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9672 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9673 break; \
9674 } \
9675 } \
9676 } while (0)
9677
Victor Stinner9310abb2011-10-05 00:59:23 +02009678static PyObject *
9679pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009680 Py_ssize_t left,
9681 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 PyObject *u;
9685 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009686 int kind;
9687 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688
9689 if (left < 0)
9690 left = 0;
9691 if (right < 0)
9692 right = 0;
9693
Victor Stinnerc4b49542011-12-11 22:44:26 +01009694 if (left == 0 && right == 0)
9695 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9698 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009699 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9700 return NULL;
9701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9703 if (fill > maxchar)
9704 maxchar = fill;
9705 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009706 if (!u)
9707 return NULL;
9708
9709 kind = PyUnicode_KIND(u);
9710 data = PyUnicode_DATA(u);
9711 if (left)
9712 FILL(kind, data, fill, 0, left);
9713 if (right)
9714 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009715 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009716 assert(_PyUnicode_CheckConsistency(u, 1));
9717 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720
Alexander Belopolsky40018472011-02-26 01:02:56 +00009721PyObject *
9722PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725
9726 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 switch(PyUnicode_KIND(string)) {
9731 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009732 if (PyUnicode_IS_ASCII(string))
9733 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009734 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009735 PyUnicode_GET_LENGTH(string), keepends);
9736 else
9737 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009738 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009739 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 break;
9741 case PyUnicode_2BYTE_KIND:
9742 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009743 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 PyUnicode_GET_LENGTH(string), keepends);
9745 break;
9746 case PyUnicode_4BYTE_KIND:
9747 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009748 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 PyUnicode_GET_LENGTH(string), keepends);
9750 break;
9751 default:
9752 assert(0);
9753 list = 0;
9754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755 Py_DECREF(string);
9756 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757}
9758
Alexander Belopolsky40018472011-02-26 01:02:56 +00009759static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009760split(PyObject *self,
9761 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009762 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 int kind1, kind2, kind;
9765 void *buf1, *buf2;
9766 Py_ssize_t len1, len2;
9767 PyObject* out;
9768
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009770 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 if (PyUnicode_READY(self) == -1)
9773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 if (substring == NULL)
9776 switch(PyUnicode_KIND(self)) {
9777 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009778 if (PyUnicode_IS_ASCII(self))
9779 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009780 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009781 PyUnicode_GET_LENGTH(self), maxcount
9782 );
9783 else
9784 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009785 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009786 PyUnicode_GET_LENGTH(self), maxcount
9787 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 case PyUnicode_2BYTE_KIND:
9789 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009790 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 PyUnicode_GET_LENGTH(self), maxcount
9792 );
9793 case PyUnicode_4BYTE_KIND:
9794 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009795 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 PyUnicode_GET_LENGTH(self), maxcount
9797 );
9798 default:
9799 assert(0);
9800 return NULL;
9801 }
9802
9803 if (PyUnicode_READY(substring) == -1)
9804 return NULL;
9805
9806 kind1 = PyUnicode_KIND(self);
9807 kind2 = PyUnicode_KIND(substring);
9808 kind = kind1 > kind2 ? kind1 : kind2;
9809 buf1 = PyUnicode_DATA(self);
9810 buf2 = PyUnicode_DATA(substring);
9811 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009812 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 if (!buf1)
9814 return NULL;
9815 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009816 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 if (!buf2) {
9818 if (kind1 != kind) PyMem_Free(buf1);
9819 return NULL;
9820 }
9821 len1 = PyUnicode_GET_LENGTH(self);
9822 len2 = PyUnicode_GET_LENGTH(substring);
9823
9824 switch(kind) {
9825 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009826 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9827 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829 else
9830 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009831 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 break;
9833 case PyUnicode_2BYTE_KIND:
9834 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 break;
9837 case PyUnicode_4BYTE_KIND:
9838 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009839 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 break;
9841 default:
9842 out = NULL;
9843 }
9844 if (kind1 != kind)
9845 PyMem_Free(buf1);
9846 if (kind2 != kind)
9847 PyMem_Free(buf2);
9848 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849}
9850
Alexander Belopolsky40018472011-02-26 01:02:56 +00009851static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009852rsplit(PyObject *self,
9853 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009854 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 int kind1, kind2, kind;
9857 void *buf1, *buf2;
9858 Py_ssize_t len1, len2;
9859 PyObject* out;
9860
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009861 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009862 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 if (PyUnicode_READY(self) == -1)
9865 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 if (substring == NULL)
9868 switch(PyUnicode_KIND(self)) {
9869 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870 if (PyUnicode_IS_ASCII(self))
9871 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009873 PyUnicode_GET_LENGTH(self), maxcount
9874 );
9875 else
9876 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009877 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009878 PyUnicode_GET_LENGTH(self), maxcount
9879 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 case PyUnicode_2BYTE_KIND:
9881 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009882 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 PyUnicode_GET_LENGTH(self), maxcount
9884 );
9885 case PyUnicode_4BYTE_KIND:
9886 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009887 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 PyUnicode_GET_LENGTH(self), maxcount
9889 );
9890 default:
9891 assert(0);
9892 return NULL;
9893 }
9894
9895 if (PyUnicode_READY(substring) == -1)
9896 return NULL;
9897
9898 kind1 = PyUnicode_KIND(self);
9899 kind2 = PyUnicode_KIND(substring);
9900 kind = kind1 > kind2 ? kind1 : kind2;
9901 buf1 = PyUnicode_DATA(self);
9902 buf2 = PyUnicode_DATA(substring);
9903 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009904 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (!buf1)
9906 return NULL;
9907 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009908 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 if (!buf2) {
9910 if (kind1 != kind) PyMem_Free(buf1);
9911 return NULL;
9912 }
9913 len1 = PyUnicode_GET_LENGTH(self);
9914 len2 = PyUnicode_GET_LENGTH(substring);
9915
9916 switch(kind) {
9917 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9919 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009921 else
9922 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009923 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 break;
9925 case PyUnicode_2BYTE_KIND:
9926 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 break;
9929 case PyUnicode_4BYTE_KIND:
9930 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009931 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 break;
9933 default:
9934 out = NULL;
9935 }
9936 if (kind1 != kind)
9937 PyMem_Free(buf1);
9938 if (kind2 != kind)
9939 PyMem_Free(buf2);
9940 return out;
9941}
9942
9943static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009944anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9945 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946{
9947 switch(kind) {
9948 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009949 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9950 return asciilib_find(buf1, len1, buf2, len2, offset);
9951 else
9952 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 case PyUnicode_2BYTE_KIND:
9954 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9955 case PyUnicode_4BYTE_KIND:
9956 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9957 }
9958 assert(0);
9959 return -1;
9960}
9961
9962static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009963anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9964 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965{
9966 switch(kind) {
9967 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9969 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9970 else
9971 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 case PyUnicode_2BYTE_KIND:
9973 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9974 case PyUnicode_4BYTE_KIND:
9975 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9976 }
9977 assert(0);
9978 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009979}
9980
Alexander Belopolsky40018472011-02-26 01:02:56 +00009981static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982replace(PyObject *self, PyObject *str1,
9983 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 PyObject *u;
9986 char *sbuf = PyUnicode_DATA(self);
9987 char *buf1 = PyUnicode_DATA(str1);
9988 char *buf2 = PyUnicode_DATA(str2);
9989 int srelease = 0, release1 = 0, release2 = 0;
9990 int skind = PyUnicode_KIND(self);
9991 int kind1 = PyUnicode_KIND(str1);
9992 int kind2 = PyUnicode_KIND(str2);
9993 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9994 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9995 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009996 int mayshrink;
9997 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998
9999 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010002 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003
Victor Stinner59de0ee2011-10-07 10:01:28 +020010004 if (str1 == str2)
10005 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 if (skind < kind1)
10007 /* substring too wide to be present */
10008 goto nothing;
10009
Victor Stinner49a0a212011-10-12 23:46:10 +020010010 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10011 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10012 /* Replacing str1 with str2 may cause a maxchar reduction in the
10013 result string. */
10014 mayshrink = (maxchar_str2 < maxchar);
10015 maxchar = Py_MAX(maxchar, maxchar_str2);
10016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010018 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010019 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010021 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010023 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010024 Py_UCS4 u1, u2;
10025 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010027 if (findchar(sbuf, PyUnicode_KIND(self),
10028 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010029 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010032 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010034 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 rkind = PyUnicode_KIND(u);
10036 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10037 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010038 if (--maxcount < 0)
10039 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010041 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010042 }
10043 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 int rkind = skind;
10045 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (kind1 < rkind) {
10048 /* widen substring */
10049 buf1 = _PyUnicode_AsKind(str1, rkind);
10050 if (!buf1) goto error;
10051 release1 = 1;
10052 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 if (i < 0)
10055 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 if (rkind > kind2) {
10057 /* widen replacement */
10058 buf2 = _PyUnicode_AsKind(str2, rkind);
10059 if (!buf2) goto error;
10060 release2 = 1;
10061 }
10062 else if (rkind < kind2) {
10063 /* widen self and buf1 */
10064 rkind = kind2;
10065 if (release1) PyMem_Free(buf1);
10066 sbuf = _PyUnicode_AsKind(self, rkind);
10067 if (!sbuf) goto error;
10068 srelease = 1;
10069 buf1 = _PyUnicode_AsKind(str1, rkind);
10070 if (!buf1) goto error;
10071 release1 = 1;
10072 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010073 u = PyUnicode_New(slen, maxchar);
10074 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010076 assert(PyUnicode_KIND(u) == rkind);
10077 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010078
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010079 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010080 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010083 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010085
10086 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010088 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010089 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010090 if (i == -1)
10091 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010092 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010094 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010098 }
10099 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 Py_ssize_t n, i, j, ires;
10101 Py_ssize_t product, new_size;
10102 int rkind = skind;
10103 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010106 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 buf1 = _PyUnicode_AsKind(str1, rkind);
10108 if (!buf1) goto error;
10109 release1 = 1;
10110 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010111 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010112 if (n == 0)
10113 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010115 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 buf2 = _PyUnicode_AsKind(str2, rkind);
10117 if (!buf2) goto error;
10118 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010121 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 rkind = kind2;
10123 sbuf = _PyUnicode_AsKind(self, rkind);
10124 if (!sbuf) goto error;
10125 srelease = 1;
10126 if (release1) PyMem_Free(buf1);
10127 buf1 = _PyUnicode_AsKind(str1, rkind);
10128 if (!buf1) goto error;
10129 release1 = 1;
10130 }
10131 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10132 PyUnicode_GET_LENGTH(str1))); */
10133 product = n * (len2-len1);
10134 if ((product / (len2-len1)) != n) {
10135 PyErr_SetString(PyExc_OverflowError,
10136 "replace string is too long");
10137 goto error;
10138 }
10139 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010140 if (new_size == 0) {
10141 Py_INCREF(unicode_empty);
10142 u = unicode_empty;
10143 goto done;
10144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10146 PyErr_SetString(PyExc_OverflowError,
10147 "replace string is too long");
10148 goto error;
10149 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010150 u = PyUnicode_New(new_size, maxchar);
10151 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010153 assert(PyUnicode_KIND(u) == rkind);
10154 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 ires = i = 0;
10156 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 while (n-- > 0) {
10158 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010159 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010160 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010161 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010162 if (j == -1)
10163 break;
10164 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010166 memcpy(res + rkind * ires,
10167 sbuf + rkind * i,
10168 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010170 }
10171 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010173 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010175 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010182 memcpy(res + rkind * ires,
10183 sbuf + rkind * i,
10184 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010185 }
10186 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010187 /* interleave */
10188 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010189 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010193 if (--n <= 0)
10194 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010195 memcpy(res + rkind * ires,
10196 sbuf + rkind * i,
10197 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 ires++;
10199 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010200 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010201 memcpy(res + rkind * ires,
10202 sbuf + rkind * i,
10203 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010204 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 }
10206
10207 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010208 unicode_adjust_maxchar(&u);
10209 if (u == NULL)
10210 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010212
10213 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 if (srelease)
10215 PyMem_FREE(sbuf);
10216 if (release1)
10217 PyMem_FREE(buf1);
10218 if (release2)
10219 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010220 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (srelease)
10226 PyMem_FREE(sbuf);
10227 if (release1)
10228 PyMem_FREE(buf1);
10229 if (release2)
10230 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010231 return unicode_result_unchanged(self);
10232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 error:
10234 if (srelease && sbuf)
10235 PyMem_FREE(sbuf);
10236 if (release1 && buf1)
10237 PyMem_FREE(buf1);
10238 if (release2 && buf2)
10239 PyMem_FREE(buf2);
10240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241}
10242
10243/* --- Unicode Object Methods --------------------------------------------- */
10244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010245PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010246 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247\n\
10248Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010249characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
10251static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010252unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254 return fixup(self, fixtitle);
10255}
10256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010257PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010258 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259\n\
10260Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010261have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262
10263static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010264unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 return fixup(self, fixcapitalize);
10267}
10268
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of str.capwords(): split self with the default
   separator, run fixup(..., fixcapitalize) on each piece, and join the
   pieces again with PyUnicode_Join(NULL, ...).  Returns NULL if the split
   or any fixup call fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup(PyList_GET_ITEM(words, pos), fixcapitalize);
        if (result == NULL)
            goto onError;   /* result is NULL, which is what gets returned */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return result;
}
#endif
10306
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010307/* Argument converter. Coerces to a single unicode character */
10308
10309static int
10310convert_uc(PyObject *obj, void *addr)
10311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010313 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010314
Benjamin Peterson14339b62009-01-31 16:36:08 +000010315 uniobj = PyUnicode_FromObject(obj);
10316 if (uniobj == NULL) {
10317 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010319 return 0;
10320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010322 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010323 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010324 Py_DECREF(uniobj);
10325 return 0;
10326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010328 Py_DECREF(uniobj);
10329 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010330}
10331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010332PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010333 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010335Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010336done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337
10338static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010339unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010341 Py_ssize_t marg, left;
10342 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 Py_UCS4 fillchar = ' ';
10344
Victor Stinnere9a29352011-10-01 02:14:59 +020010345 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347
Victor Stinnerc4b49542011-12-11 22:44:26 +010010348 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 return NULL;
10350
Victor Stinnerc4b49542011-12-11 22:44:26 +010010351 if (PyUnicode_GET_LENGTH(self) >= width)
10352 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353
Victor Stinnerc4b49542011-12-11 22:44:26 +010010354 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355 left = marg / 2 + (marg & width & 1);
10356
Victor Stinner9310abb2011-10-05 00:59:23 +020010357 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358}
10359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360/* This function assumes that str1 and str2 are readied by the caller. */
10361
Marc-André Lemburge5034372000-08-08 08:04:29 +000010362static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010363unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 int kind1, kind2;
10366 void *data1, *data2;
10367 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 kind1 = PyUnicode_KIND(str1);
10370 kind2 = PyUnicode_KIND(str2);
10371 data1 = PyUnicode_DATA(str1);
10372 data2 = PyUnicode_DATA(str2);
10373 len1 = PyUnicode_GET_LENGTH(str1);
10374 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 for (i = 0; i < len1 && i < len2; ++i) {
10377 Py_UCS4 c1, c2;
10378 c1 = PyUnicode_READ(kind1, data1, i);
10379 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010380
10381 if (c1 != c2)
10382 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010383 }
10384
10385 return (len1 < len2) ? -1 : (len1 != len2);
10386}
10387
Alexander Belopolsky40018472011-02-26 01:02:56 +000010388int
10389PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10392 if (PyUnicode_READY(left) == -1 ||
10393 PyUnicode_READY(right) == -1)
10394 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010395 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010397 PyErr_Format(PyExc_TypeError,
10398 "Can't compare %.100s and %.100s",
10399 left->ob_type->tp_name,
10400 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 return -1;
10402}
10403
Martin v. Löwis5b222132007-06-10 09:51:05 +000010404int
10405PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 Py_ssize_t i;
10408 int kind;
10409 void *data;
10410 Py_UCS4 chr;
10411
Victor Stinner910337b2011-10-03 03:20:16 +020010412 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (PyUnicode_READY(uni) == -1)
10414 return -1;
10415 kind = PyUnicode_KIND(uni);
10416 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010417 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10419 if (chr != str[i])
10420 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010421 /* This check keeps Python strings that end in '\0' from comparing equal
10422 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010425 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010427 return 0;
10428}
10429
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010430
Benjamin Peterson29060642009-01-31 22:14:21 +000010431#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010432 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010433
Alexander Belopolsky40018472011-02-26 01:02:56 +000010434PyObject *
10435PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010436{
10437 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010438
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010439 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10440 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (PyUnicode_READY(left) == -1 ||
10442 PyUnicode_READY(right) == -1)
10443 return NULL;
10444 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10445 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010446 if (op == Py_EQ) {
10447 Py_INCREF(Py_False);
10448 return Py_False;
10449 }
10450 if (op == Py_NE) {
10451 Py_INCREF(Py_True);
10452 return Py_True;
10453 }
10454 }
10455 if (left == right)
10456 result = 0;
10457 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010458 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010459
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010460 /* Convert the return value to a Boolean */
10461 switch (op) {
10462 case Py_EQ:
10463 v = TEST_COND(result == 0);
10464 break;
10465 case Py_NE:
10466 v = TEST_COND(result != 0);
10467 break;
10468 case Py_LE:
10469 v = TEST_COND(result <= 0);
10470 break;
10471 case Py_GE:
10472 v = TEST_COND(result >= 0);
10473 break;
10474 case Py_LT:
10475 v = TEST_COND(result == -1);
10476 break;
10477 case Py_GT:
10478 v = TEST_COND(result == 1);
10479 break;
10480 default:
10481 PyErr_BadArgument();
10482 return NULL;
10483 }
10484 Py_INCREF(v);
10485 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010486 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010487
Brian Curtindfc80e32011-08-10 20:28:54 -050010488 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010489}
10490
Alexander Belopolsky40018472011-02-26 01:02:56 +000010491int
10492PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010493{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 int kind1, kind2, kind;
10496 void *buf1, *buf2;
10497 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010498 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010499
10500 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 sub = PyUnicode_FromObject(element);
10502 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 PyErr_Format(PyExc_TypeError,
10504 "'in <string>' requires string as left operand, not %s",
10505 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010506 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 if (PyUnicode_READY(sub) == -1)
10509 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010510
Thomas Wouters477c8d52006-05-27 19:21:47 +000010511 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010512 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 Py_DECREF(sub);
10514 return -1;
10515 }
10516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 kind1 = PyUnicode_KIND(str);
10518 kind2 = PyUnicode_KIND(sub);
10519 kind = kind1 > kind2 ? kind1 : kind2;
10520 buf1 = PyUnicode_DATA(str);
10521 buf2 = PyUnicode_DATA(sub);
10522 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010523 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (!buf1) {
10525 Py_DECREF(sub);
10526 return -1;
10527 }
10528 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010529 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (!buf2) {
10531 Py_DECREF(sub);
10532 if (kind1 != kind) PyMem_Free(buf1);
10533 return -1;
10534 }
10535 len1 = PyUnicode_GET_LENGTH(str);
10536 len2 = PyUnicode_GET_LENGTH(sub);
10537
10538 switch(kind) {
10539 case PyUnicode_1BYTE_KIND:
10540 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10541 break;
10542 case PyUnicode_2BYTE_KIND:
10543 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10544 break;
10545 case PyUnicode_4BYTE_KIND:
10546 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10547 break;
10548 default:
10549 result = -1;
10550 assert(0);
10551 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010552
10553 Py_DECREF(str);
10554 Py_DECREF(sub);
10555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (kind1 != kind)
10557 PyMem_Free(buf1);
10558 if (kind2 != kind)
10559 PyMem_Free(buf2);
10560
Guido van Rossum403d68b2000-03-13 15:55:09 +000010561 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010562}
10563
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564/* Concat to string or Unicode object giving a new Unicode object. */
10565
Alexander Belopolsky40018472011-02-26 01:02:56 +000010566PyObject *
10567PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010570 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010571 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
10573 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580
10581 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010582 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010586 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 }
10590
Victor Stinner488fa492011-12-12 00:01:39 +010010591 u_len = PyUnicode_GET_LENGTH(u);
10592 v_len = PyUnicode_GET_LENGTH(v);
10593 if (u_len > PY_SSIZE_T_MAX - v_len) {
10594 PyErr_SetString(PyExc_OverflowError,
10595 "strings are too large to concat");
10596 goto onError;
10597 }
10598 new_len = u_len + v_len;
10599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010601 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10602 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010605 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010607 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010608 copy_characters(w, 0, u, 0, u_len);
10609 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 Py_DECREF(u);
10611 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010612 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 Py_XDECREF(u);
10617 Py_XDECREF(v);
10618 return NULL;
10619}
10620
Walter Dörwald1ab83302007-05-18 17:15:44 +000010621void
Victor Stinner23e56682011-10-03 03:54:37 +020010622PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010623{
Victor Stinner23e56682011-10-03 03:54:37 +020010624 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010625 Py_UCS4 maxchar, maxchar2;
10626 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010627
10628 if (p_left == NULL) {
10629 if (!PyErr_Occurred())
10630 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010631 return;
10632 }
Victor Stinner23e56682011-10-03 03:54:37 +020010633 left = *p_left;
10634 if (right == NULL || !PyUnicode_Check(left)) {
10635 if (!PyErr_Occurred())
10636 PyErr_BadInternalCall();
10637 goto error;
10638 }
10639
Victor Stinnere1335c72011-10-04 20:53:03 +020010640 if (PyUnicode_READY(left))
10641 goto error;
10642 if (PyUnicode_READY(right))
10643 goto error;
10644
Victor Stinner488fa492011-12-12 00:01:39 +010010645 /* Shortcuts */
10646 if (left == unicode_empty) {
10647 Py_DECREF(left);
10648 Py_INCREF(right);
10649 *p_left = right;
10650 return;
10651 }
10652 if (right == unicode_empty)
10653 return;
10654
10655 left_len = PyUnicode_GET_LENGTH(left);
10656 right_len = PyUnicode_GET_LENGTH(right);
10657 if (left_len > PY_SSIZE_T_MAX - right_len) {
10658 PyErr_SetString(PyExc_OverflowError,
10659 "strings are too large to concat");
10660 goto error;
10661 }
10662 new_len = left_len + right_len;
10663
10664 if (unicode_modifiable(left)
10665 && PyUnicode_CheckExact(right)
10666 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010667 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10668 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010669 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010670 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010671 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10672 {
10673 /* append inplace */
10674 if (unicode_resize(p_left, new_len) != 0) {
10675 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10676 * deallocated so it cannot be put back into
10677 * 'variable'. The MemoryError is raised when there
10678 * is no value in 'variable', which might (very
10679 * remotely) be a cause of incompatibilities.
10680 */
10681 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010682 }
Victor Stinner488fa492011-12-12 00:01:39 +010010683 /* copy 'right' into the newly allocated area of 'left' */
10684 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010685 }
Victor Stinner488fa492011-12-12 00:01:39 +010010686 else {
10687 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10688 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10689 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010690
Victor Stinner488fa492011-12-12 00:01:39 +010010691 /* Concat the two Unicode strings */
10692 res = PyUnicode_New(new_len, maxchar);
10693 if (res == NULL)
10694 goto error;
10695 copy_characters(res, 0, left, 0, left_len);
10696 copy_characters(res, left_len, right, 0, right_len);
10697 Py_DECREF(left);
10698 *p_left = res;
10699 }
10700 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010701 return;
10702
10703error:
Victor Stinner488fa492011-12-12 00:01:39 +010010704 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010705}
10706
10707void
10708PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10709{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010710 PyUnicode_Append(pleft, right);
10711 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010712}
10713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010714PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010718string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010719interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720
10721static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010722unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010724 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010725 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010726 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 int kind1, kind2, kind;
10729 void *buf1, *buf2;
10730 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
Jesus Ceaac451502011-04-20 17:09:23 +020010732 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10733 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010734 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 kind1 = PyUnicode_KIND(self);
10737 kind2 = PyUnicode_KIND(substring);
10738 kind = kind1 > kind2 ? kind1 : kind2;
10739 buf1 = PyUnicode_DATA(self);
10740 buf2 = PyUnicode_DATA(substring);
10741 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010742 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 if (!buf1) {
10744 Py_DECREF(substring);
10745 return NULL;
10746 }
10747 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010748 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (!buf2) {
10750 Py_DECREF(substring);
10751 if (kind1 != kind) PyMem_Free(buf1);
10752 return NULL;
10753 }
10754 len1 = PyUnicode_GET_LENGTH(self);
10755 len2 = PyUnicode_GET_LENGTH(substring);
10756
10757 ADJUST_INDICES(start, end, len1);
10758 switch(kind) {
10759 case PyUnicode_1BYTE_KIND:
10760 iresult = ucs1lib_count(
10761 ((Py_UCS1*)buf1) + start, end - start,
10762 buf2, len2, PY_SSIZE_T_MAX
10763 );
10764 break;
10765 case PyUnicode_2BYTE_KIND:
10766 iresult = ucs2lib_count(
10767 ((Py_UCS2*)buf1) + start, end - start,
10768 buf2, len2, PY_SSIZE_T_MAX
10769 );
10770 break;
10771 case PyUnicode_4BYTE_KIND:
10772 iresult = ucs4lib_count(
10773 ((Py_UCS4*)buf1) + start, end - start,
10774 buf2, len2, PY_SSIZE_T_MAX
10775 );
10776 break;
10777 default:
10778 assert(0); iresult = 0;
10779 }
10780
10781 result = PyLong_FromSsize_t(iresult);
10782
10783 if (kind1 != kind)
10784 PyMem_Free(buf1);
10785 if (kind2 != kind)
10786 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
10788 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790 return result;
10791}
10792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010793PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010794 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010796Encode S using the codec registered for encoding. Default encoding\n\
10797is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010798handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010799a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10800'xmlcharrefreplace' as well as any other name registered with\n\
10801codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802
10803static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010804unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010806 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 char *encoding = NULL;
10808 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010809
Benjamin Peterson308d6372009-09-18 21:42:35 +000010810 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10811 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010813 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010814}
10815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010816PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010817 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818\n\
10819Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010820If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
10822static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010825 Py_ssize_t i, j, line_pos, src_len, incr;
10826 Py_UCS4 ch;
10827 PyObject *u;
10828 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010830 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010831 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
10833 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
Antoine Pitrou22425222011-10-04 19:10:51 +020010836 if (PyUnicode_READY(self) == -1)
10837 return NULL;
10838
Thomas Wouters7e474022000-07-16 12:04:32 +000010839 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010840 src_len = PyUnicode_GET_LENGTH(self);
10841 i = j = line_pos = 0;
10842 kind = PyUnicode_KIND(self);
10843 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010844 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010845 for (; i < src_len; i++) {
10846 ch = PyUnicode_READ(kind, src_data, i);
10847 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010848 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010850 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010852 goto overflow;
10853 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010854 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010855 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010858 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010859 goto overflow;
10860 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 if (ch == '\n' || ch == '\r')
10863 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010865 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010866 if (!found)
10867 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010868
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010870 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 if (!u)
10872 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010873 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
Antoine Pitroue71d5742011-10-04 15:55:09 +020010875 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 for (; i < src_len; i++) {
10878 ch = PyUnicode_READ(kind, src_data, i);
10879 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010881 incr = tabsize - (line_pos % tabsize);
10882 line_pos += incr;
10883 while (incr--) {
10884 PyUnicode_WRITE(kind, dest_data, j, ' ');
10885 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010886 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010890 line_pos++;
10891 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010892 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010893 if (ch == '\n' || ch == '\r')
10894 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010896 }
10897 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010898 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010899
Antoine Pitroue71d5742011-10-04 15:55:09 +020010900 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010901 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903}
10904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907\n\
10908Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010909such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910arguments start and end are interpreted as in slice notation.\n\
10911\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
10914static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010917 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010918 Py_ssize_t start;
10919 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010920 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
Jesus Ceaac451502011-04-20 17:09:23 +020010922 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10923 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (PyUnicode_READY(self) == -1)
10927 return NULL;
10928 if (PyUnicode_READY(substring) == -1)
10929 return NULL;
10930
Victor Stinner7931d9a2011-11-04 00:22:48 +010010931 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
10933 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 if (result == -2)
10936 return NULL;
10937
Christian Heimes217cfd12007-12-02 14:31:20 +000010938 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939}
10940
10941static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010942unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010944 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10945 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948}
10949
Guido van Rossumc2504932007-09-18 19:42:40 +000010950/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010951 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010952static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010953unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954{
Guido van Rossumc2504932007-09-18 19:42:40 +000010955 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010956 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (_PyUnicode_HASH(self) != -1)
10959 return _PyUnicode_HASH(self);
10960 if (PyUnicode_READY(self) == -1)
10961 return -1;
10962 len = PyUnicode_GET_LENGTH(self);
10963
10964 /* The hash function as a macro, gets expanded three times below. */
10965#define HASH(P) \
10966 x = (Py_uhash_t)*P << 7; \
10967 while (--len >= 0) \
10968 x = (1000003*x) ^ (Py_uhash_t)*P++;
10969
10970 switch (PyUnicode_KIND(self)) {
10971 case PyUnicode_1BYTE_KIND: {
10972 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10973 HASH(c);
10974 break;
10975 }
10976 case PyUnicode_2BYTE_KIND: {
10977 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10978 HASH(s);
10979 break;
10980 }
10981 default: {
10982 Py_UCS4 *l;
10983 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10984 "Impossible switch case in unicode_hash");
10985 l = PyUnicode_4BYTE_DATA(self);
10986 HASH(l);
10987 break;
10988 }
10989 }
10990 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10991
Guido van Rossumc2504932007-09-18 19:42:40 +000010992 if (x == -1)
10993 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010995 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010999PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011007 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011008 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011009 Py_ssize_t start;
11010 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011
Jesus Ceaac451502011-04-20 17:09:23 +020011012 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11013 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (PyUnicode_READY(self) == -1)
11017 return NULL;
11018 if (PyUnicode_READY(substring) == -1)
11019 return NULL;
11020
Victor Stinner7931d9a2011-11-04 00:22:48 +010011021 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022
11023 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 if (result == -2)
11026 return NULL;
11027
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 if (result < 0) {
11029 PyErr_SetString(PyExc_ValueError, "substring not found");
11030 return NULL;
11031 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011032
Christian Heimes217cfd12007-12-02 14:31:20 +000011033 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034}
11035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011036PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011039Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011040at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041
11042static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011043unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 Py_ssize_t i, length;
11046 int kind;
11047 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 int cased;
11049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (PyUnicode_READY(self) == -1)
11051 return NULL;
11052 length = PyUnicode_GET_LENGTH(self);
11053 kind = PyUnicode_KIND(self);
11054 data = PyUnicode_DATA(self);
11055
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (length == 1)
11058 return PyBool_FromLong(
11059 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011061 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011064
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 for (i = 0; i < length; i++) {
11067 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011068
Benjamin Peterson29060642009-01-31 22:14:21 +000011069 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11070 return PyBool_FromLong(0);
11071 else if (!cased && Py_UNICODE_ISLOWER(ch))
11072 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011074 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075}
11076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011077PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011080Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011081at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
11083static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011084unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 Py_ssize_t i, length;
11087 int kind;
11088 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089 int cased;
11090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (PyUnicode_READY(self) == -1)
11092 return NULL;
11093 length = PyUnicode_GET_LENGTH(self);
11094 kind = PyUnicode_KIND(self);
11095 data = PyUnicode_DATA(self);
11096
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (length == 1)
11099 return PyBool_FromLong(
11100 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011102 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011105
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 for (i = 0; i < length; i++) {
11108 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011109
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11111 return PyBool_FromLong(0);
11112 else if (!cased && Py_UNICODE_ISUPPER(ch))
11113 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011115 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116}
11117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011118PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011121Return True if S is a titlecased string and there is at least one\n\
11122character in S, i.e. upper- and titlecase characters may only\n\
11123follow uncased characters and lowercase characters only cased ones.\n\
11124Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125
11126static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011127unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 Py_ssize_t i, length;
11130 int kind;
11131 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 int cased, previous_is_cased;
11133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 if (PyUnicode_READY(self) == -1)
11135 return NULL;
11136 length = PyUnicode_GET_LENGTH(self);
11137 kind = PyUnicode_KIND(self);
11138 data = PyUnicode_DATA(self);
11139
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (length == 1) {
11142 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11143 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11144 (Py_UNICODE_ISUPPER(ch) != 0));
11145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011147 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011150
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151 cased = 0;
11152 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 for (i = 0; i < length; i++) {
11154 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011155
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11157 if (previous_is_cased)
11158 return PyBool_FromLong(0);
11159 previous_is_cased = 1;
11160 cased = 1;
11161 }
11162 else if (Py_UNICODE_ISLOWER(ch)) {
11163 if (!previous_is_cased)
11164 return PyBool_FromLong(0);
11165 previous_is_cased = 1;
11166 cased = 1;
11167 }
11168 else
11169 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011171 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172}
11173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011174PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011177Return True if all characters in S are whitespace\n\
11178and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179
11180static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011181unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 Py_ssize_t i, length;
11184 int kind;
11185 void *data;
11186
11187 if (PyUnicode_READY(self) == -1)
11188 return NULL;
11189 length = PyUnicode_GET_LENGTH(self);
11190 kind = PyUnicode_KIND(self);
11191 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (length == 1)
11195 return PyBool_FromLong(
11196 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011198 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 for (i = 0; i < length; i++) {
11203 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011204 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011205 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011207 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208}
11209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011210PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011212\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011213Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011214and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011215
11216static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011217unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 Py_ssize_t i, length;
11220 int kind;
11221 void *data;
11222
11223 if (PyUnicode_READY(self) == -1)
11224 return NULL;
11225 length = PyUnicode_GET_LENGTH(self);
11226 kind = PyUnicode_KIND(self);
11227 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011228
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011229 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (length == 1)
11231 return PyBool_FromLong(
11232 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011233
11234 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 for (i = 0; i < length; i++) {
11239 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011241 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011242 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011243}
11244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011247\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011248Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011250
11251static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011252unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 int kind;
11255 void *data;
11256 Py_ssize_t len, i;
11257
11258 if (PyUnicode_READY(self) == -1)
11259 return NULL;
11260
11261 kind = PyUnicode_KIND(self);
11262 data = PyUnicode_DATA(self);
11263 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011264
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011265 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 if (len == 1) {
11267 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11268 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11269 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011270
11271 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 for (i = 0; i < len; i++) {
11276 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011277 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011279 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011280 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281}
11282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011283PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011286Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011287False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
11289static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011290unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 Py_ssize_t i, length;
11293 int kind;
11294 void *data;
11295
11296 if (PyUnicode_READY(self) == -1)
11297 return NULL;
11298 length = PyUnicode_GET_LENGTH(self);
11299 kind = PyUnicode_KIND(self);
11300 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (length == 1)
11304 return PyBool_FromLong(
11305 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011307 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 for (i = 0; i < length; i++) {
11312 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011315 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316}
11317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011321Return True if all characters in S are digits\n\
11322and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
11324static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011325unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 Py_ssize_t i, length;
11328 int kind;
11329 void *data;
11330
11331 if (PyUnicode_READY(self) == -1)
11332 return NULL;
11333 length = PyUnicode_GET_LENGTH(self);
11334 kind = PyUnicode_KIND(self);
11335 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (length == 1) {
11339 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11340 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011343 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 for (i = 0; i < length; i++) {
11348 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011351 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352}
11353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011354PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011357Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011358False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
11360static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011361unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 Py_ssize_t i, length;
11364 int kind;
11365 void *data;
11366
11367 if (PyUnicode_READY(self) == -1)
11368 return NULL;
11369 length = PyUnicode_GET_LENGTH(self);
11370 kind = PyUnicode_KIND(self);
11371 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 1)
11375 return PyBool_FromLong(
11376 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 for (i = 0; i < length; i++) {
11383 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011386 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387}
11388
Martin v. Löwis47383402007-08-15 07:32:56 +000011389int
11390PyUnicode_IsIdentifier(PyObject *self)
11391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 int kind;
11393 void *data;
11394 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011395 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (PyUnicode_READY(self) == -1) {
11398 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011399 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 }
11401
11402 /* Special case for empty strings */
11403 if (PyUnicode_GET_LENGTH(self) == 0)
11404 return 0;
11405 kind = PyUnicode_KIND(self);
11406 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011407
11408 /* PEP 3131 says that the first character must be in
11409 XID_Start and subsequent characters in XID_Continue,
11410 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011411 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011412 letters, digits, underscore). However, given the current
11413 definition of XID_Start and XID_Continue, it is sufficient
11414 to check just for these, except that _ must be allowed
11415 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011417 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011418 return 0;
11419
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011420 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011423 return 1;
11424}
11425
11426PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011428\n\
11429Return True if S is a valid identifier according\n\
11430to the language definition.");
11431
11432static PyObject*
11433unicode_isidentifier(PyObject *self)
11434{
11435 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11436}
11437
Georg Brandl559e5d72008-06-11 18:37:52 +000011438PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011440\n\
11441Return True if all characters in S are considered\n\
11442printable in repr() or S is empty, False otherwise.");
11443
11444static PyObject*
11445unicode_isprintable(PyObject *self)
11446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 Py_ssize_t i, length;
11448 int kind;
11449 void *data;
11450
11451 if (PyUnicode_READY(self) == -1)
11452 return NULL;
11453 length = PyUnicode_GET_LENGTH(self);
11454 kind = PyUnicode_KIND(self);
11455 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011456
11457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 1)
11459 return PyBool_FromLong(
11460 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 for (i = 0; i < length; i++) {
11463 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011464 Py_RETURN_FALSE;
11465 }
11466 }
11467 Py_RETURN_TRUE;
11468}
11469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011470PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011471 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472\n\
11473Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011474iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
11476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011477unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011479 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480}
11481
Martin v. Löwis18e16552006-02-15 17:27:45 +000011482static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011483unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (PyUnicode_READY(self) == -1)
11486 return -1;
11487 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011493Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011494done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011497unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011499 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 Py_UCS4 fillchar = ' ';
11501
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011502 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 return NULL;
11504
Victor Stinnerc4b49542011-12-11 22:44:26 +010011505 if (PyUnicode_READY(self) < 0)
11506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Victor Stinnerc4b49542011-12-11 22:44:26 +010011508 if (PyUnicode_GET_LENGTH(self) >= width)
11509 return unicode_result_unchanged(self);
11510
11511 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512}
11513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011514PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011517Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
11519static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011520unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522 return fixup(self, fixlower);
11523}
11524
/* Strip-type selectors passed to the strip helpers below; they also serve
   as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the three-character "|O:" prefix of
   the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11533
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011534/* externally visible for str.strip(unicode) */
11535PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011536_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 void *data;
11539 int kind;
11540 Py_ssize_t i, j, len;
11541 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11544 return NULL;
11545
11546 kind = PyUnicode_KIND(self);
11547 data = PyUnicode_DATA(self);
11548 len = PyUnicode_GET_LENGTH(self);
11549 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11550 PyUnicode_DATA(sepobj),
11551 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011552
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553 i = 0;
11554 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 while (i < len &&
11556 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 i++;
11558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011559 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011560
Benjamin Peterson14339b62009-01-31 16:36:08 +000011561 j = len;
11562 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 do {
11564 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 } while (j >= i &&
11566 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011568 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011569
Victor Stinner7931d9a2011-11-04 00:22:48 +010011570 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571}
11572
11573PyObject*
11574PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11575{
11576 unsigned char *data;
11577 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011578 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579
Victor Stinnerde636f32011-10-01 03:55:54 +020011580 if (PyUnicode_READY(self) == -1)
11581 return NULL;
11582
11583 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11584
Victor Stinner12bab6d2011-10-01 01:53:49 +020011585 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011586 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587
Victor Stinner12bab6d2011-10-01 01:53:49 +020011588 length = end - start;
11589 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011590 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591
Victor Stinnerde636f32011-10-01 03:55:54 +020011592 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011593 PyErr_SetString(PyExc_IndexError, "string index out of range");
11594 return NULL;
11595 }
11596
Victor Stinnerb9275c12011-10-05 14:01:42 +020011597 if (PyUnicode_IS_ASCII(self)) {
11598 kind = PyUnicode_KIND(self);
11599 data = PyUnicode_1BYTE_DATA(self);
11600 return unicode_fromascii(data + start, length);
11601 }
11602 else {
11603 kind = PyUnicode_KIND(self);
11604 data = PyUnicode_1BYTE_DATA(self);
11605 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011606 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011607 length);
11608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011612do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 int kind;
11615 void *data;
11616 Py_ssize_t len, i, j;
11617
11618 if (PyUnicode_READY(self) == -1)
11619 return NULL;
11620
11621 kind = PyUnicode_KIND(self);
11622 data = PyUnicode_DATA(self);
11623 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011624
Benjamin Peterson14339b62009-01-31 16:36:08 +000011625 i = 0;
11626 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011628 i++;
11629 }
11630 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011631
Benjamin Peterson14339b62009-01-31 16:36:08 +000011632 j = len;
11633 if (striptype != LEFTSTRIP) {
11634 do {
11635 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011637 j++;
11638 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639
Victor Stinner7931d9a2011-11-04 00:22:48 +010011640 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641}
11642
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643
11644static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011645do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011647 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011648
Benjamin Peterson14339b62009-01-31 16:36:08 +000011649 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11650 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011651
Benjamin Peterson14339b62009-01-31 16:36:08 +000011652 if (sep != NULL && sep != Py_None) {
11653 if (PyUnicode_Check(sep))
11654 return _PyUnicode_XStrip(self, striptype, sep);
11655 else {
11656 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 "%s arg must be None or str",
11658 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011659 return NULL;
11660 }
11661 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011662
Benjamin Peterson14339b62009-01-31 16:36:08 +000011663 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664}
11665
11666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011667PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011669\n\
11670Return a copy of the string S with leading and trailing\n\
11671whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011672If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011673
11674static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011675unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011676{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011677 if (PyTuple_GET_SIZE(args) == 0)
11678 return do_strip(self, BOTHSTRIP); /* Common case */
11679 else
11680 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011681}
11682
11683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011684PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686\n\
11687Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011688If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011689
11690static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011691unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011693 if (PyTuple_GET_SIZE(args) == 0)
11694 return do_strip(self, LEFTSTRIP); /* Common case */
11695 else
11696 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697}
11698
11699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011700PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011702\n\
11703Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011704If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011705
11706static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011707unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 if (PyTuple_GET_SIZE(args) == 0)
11710 return do_strip(self, RIGHTSTRIP); /* Common case */
11711 else
11712 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713}
11714
11715
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011717unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011719 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
Georg Brandl222de0f2009-04-12 12:01:50 +000011722 if (len < 1) {
11723 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011724 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
Victor Stinnerc4b49542011-12-11 22:44:26 +010011727 /* no repeat, return original string */
11728 if (len == 1)
11729 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011730
Victor Stinnerc4b49542011-12-11 22:44:26 +010011731 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 return NULL;
11733
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011734 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011735 PyErr_SetString(PyExc_OverflowError,
11736 "repeated string is too long");
11737 return NULL;
11738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011740
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011741 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 if (!u)
11743 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011744 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (PyUnicode_GET_LENGTH(str) == 1) {
11747 const int kind = PyUnicode_KIND(str);
11748 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11749 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011750 if (kind == PyUnicode_1BYTE_KIND)
11751 memset(to, (unsigned char)fill_char, len);
11752 else {
11753 for (n = 0; n < len; ++n)
11754 PyUnicode_WRITE(kind, to, n, fill_char);
11755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 }
11757 else {
11758 /* number of characters copied this far */
11759 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011760 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 char *to = (char *) PyUnicode_DATA(u);
11762 Py_MEMCPY(to, PyUnicode_DATA(str),
11763 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 n = (done <= nchars-done) ? done : nchars-done;
11766 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011767 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 }
11770
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011771 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773}
11774
Alexander Belopolsky40018472011-02-26 01:02:56 +000011775PyObject *
11776PyUnicode_Replace(PyObject *obj,
11777 PyObject *subobj,
11778 PyObject *replobj,
11779 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
11781 PyObject *self;
11782 PyObject *str1;
11783 PyObject *str2;
11784 PyObject *result;
11785
11786 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011787 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011790 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 Py_DECREF(self);
11792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 }
11794 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011795 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 Py_DECREF(self);
11797 Py_DECREF(str1);
11798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 Py_DECREF(self);
11802 Py_DECREF(str1);
11803 Py_DECREF(str2);
11804 return result;
11805}
11806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011807PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011808 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809\n\
11810Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011811old replaced by new. If the optional argument count is\n\
11812given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
11814static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 PyObject *str1;
11818 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011819 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 PyObject *result;
11821
Martin v. Löwis18e16552006-02-15 17:27:45 +000011822 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 str1 = PyUnicode_FromObject(str1);
11827 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11828 return NULL;
11829 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011830 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 Py_DECREF(str1);
11832 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
11835 result = replace(self, str1, str2, maxcount);
11836
11837 Py_DECREF(str1);
11838 Py_DECREF(str2);
11839 return result;
11840}
11841
Alexander Belopolsky40018472011-02-26 01:02:56 +000011842static PyObject *
11843unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011845 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 Py_ssize_t isize;
11847 Py_ssize_t osize, squote, dquote, i, o;
11848 Py_UCS4 max, quote;
11849 int ikind, okind;
11850 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011853 return NULL;
11854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 isize = PyUnicode_GET_LENGTH(unicode);
11856 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 /* Compute length of output, quote characters, and
11859 maximum character */
11860 osize = 2; /* quotes */
11861 max = 127;
11862 squote = dquote = 0;
11863 ikind = PyUnicode_KIND(unicode);
11864 for (i = 0; i < isize; i++) {
11865 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11866 switch (ch) {
11867 case '\'': squote++; osize++; break;
11868 case '"': dquote++; osize++; break;
11869 case '\\': case '\t': case '\r': case '\n':
11870 osize += 2; break;
11871 default:
11872 /* Fast-path ASCII */
11873 if (ch < ' ' || ch == 0x7f)
11874 osize += 4; /* \xHH */
11875 else if (ch < 0x7f)
11876 osize++;
11877 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11878 osize++;
11879 max = ch > max ? ch : max;
11880 }
11881 else if (ch < 0x100)
11882 osize += 4; /* \xHH */
11883 else if (ch < 0x10000)
11884 osize += 6; /* \uHHHH */
11885 else
11886 osize += 10; /* \uHHHHHHHH */
11887 }
11888 }
11889
11890 quote = '\'';
11891 if (squote) {
11892 if (dquote)
11893 /* Both squote and dquote present. Use squote,
11894 and escape them */
11895 osize += squote;
11896 else
11897 quote = '"';
11898 }
11899
11900 repr = PyUnicode_New(osize, max);
11901 if (repr == NULL)
11902 return NULL;
11903 okind = PyUnicode_KIND(repr);
11904 odata = PyUnicode_DATA(repr);
11905
11906 PyUnicode_WRITE(okind, odata, 0, quote);
11907 PyUnicode_WRITE(okind, odata, osize-1, quote);
11908
11909 for (i = 0, o = 1; i < isize; i++) {
11910 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011911
11912 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if ((ch == quote) || (ch == '\\')) {
11914 PyUnicode_WRITE(okind, odata, o++, '\\');
11915 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011916 continue;
11917 }
11918
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011920 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 PyUnicode_WRITE(okind, odata, o++, '\\');
11922 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011923 }
11924 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 PyUnicode_WRITE(okind, odata, o++, '\\');
11926 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011927 }
11928 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 PyUnicode_WRITE(okind, odata, o++, '\\');
11930 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011931 }
11932
11933 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011934 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 PyUnicode_WRITE(okind, odata, o++, '\\');
11936 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011939 }
11940
Georg Brandl559e5d72008-06-11 18:37:52 +000011941 /* Copy ASCII characters as-is */
11942 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011944 }
11945
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011947 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011948 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011949 (categories Z* and C* except ASCII space)
11950 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011952 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 if (ch <= 0xff) {
11954 PyUnicode_WRITE(okind, odata, o++, '\\');
11955 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011956 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011958 }
11959 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 else if (ch >= 0x10000) {
11961 PyUnicode_WRITE(okind, odata, o++, '\\');
11962 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011963 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11964 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11965 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11966 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11967 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11968 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11969 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11970 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011971 }
11972 /* Map 16-bit characters to '\uxxxx' */
11973 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 PyUnicode_WRITE(okind, odata, o++, '\\');
11975 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011976 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11977 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11978 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11979 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011980 }
11981 }
11982 /* Copy characters as-is */
11983 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011985 }
11986 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011989 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011990 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991}
11992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011993PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995\n\
11996Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011997such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998arguments start and end are interpreted as in slice notation.\n\
11999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012000Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
12002static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012005 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012006 Py_ssize_t start;
12007 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012008 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
Jesus Ceaac451502011-04-20 17:09:23 +020012010 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12011 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (PyUnicode_READY(self) == -1)
12015 return NULL;
12016 if (PyUnicode_READY(substring) == -1)
12017 return NULL;
12018
Victor Stinner7931d9a2011-11-04 00:22:48 +010012019 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020
12021 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (result == -2)
12024 return NULL;
12025
Christian Heimes217cfd12007-12-02 14:31:20 +000012026 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027}
12028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012029PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012032Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
12034static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012037 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012038 Py_ssize_t start;
12039 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012040 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
Jesus Ceaac451502011-04-20 17:09:23 +020012042 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12043 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 if (PyUnicode_READY(self) == -1)
12047 return NULL;
12048 if (PyUnicode_READY(substring) == -1)
12049 return NULL;
12050
Victor Stinner7931d9a2011-11-04 00:22:48 +010012051 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
12053 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 if (result == -2)
12056 return NULL;
12057
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 if (result < 0) {
12059 PyErr_SetString(PyExc_ValueError, "substring not found");
12060 return NULL;
12061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062
Christian Heimes217cfd12007-12-02 14:31:20 +000012063 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064}
12065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012066PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012069Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012070done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071
12072static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012073unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012075 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 Py_UCS4 fillchar = ' ';
12077
Victor Stinnere9a29352011-10-01 02:14:59 +020012078 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012080
Victor Stinnerc4b49542011-12-11 22:44:26 +010012081 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 return NULL;
12083
Victor Stinnerc4b49542011-12-11 22:44:26 +010012084 if (PyUnicode_GET_LENGTH(self) >= width)
12085 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Victor Stinnerc4b49542011-12-11 22:44:26 +010012087 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088}
12089
Alexander Belopolsky40018472011-02-26 01:02:56 +000012090PyObject *
12091PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
12093 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012094
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 s = PyUnicode_FromObject(s);
12096 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 if (sep != NULL) {
12099 sep = PyUnicode_FromObject(sep);
12100 if (sep == NULL) {
12101 Py_DECREF(s);
12102 return NULL;
12103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104 }
12105
Victor Stinner9310abb2011-10-05 00:59:23 +020012106 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
12108 Py_DECREF(s);
12109 Py_XDECREF(sep);
12110 return result;
12111}
12112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012113PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115\n\
12116Return a list of the words in S, using sep as the\n\
12117delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012118splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012119whitespace string is a separator and empty strings are\n\
12120removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
12122static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012123unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124{
12125 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012126 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
Martin v. Löwis18e16552006-02-15 17:27:45 +000012128 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129 return NULL;
12130
12131 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012134 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012136 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137}
12138
Thomas Wouters477c8d52006-05-27 19:21:47 +000012139PyObject *
12140PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12141{
12142 PyObject* str_obj;
12143 PyObject* sep_obj;
12144 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 int kind1, kind2, kind;
12146 void *buf1 = NULL, *buf2 = NULL;
12147 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012148
12149 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012150 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012152 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012154 Py_DECREF(str_obj);
12155 return NULL;
12156 }
12157
Victor Stinner14f8f022011-10-05 20:58:25 +020012158 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012160 kind = Py_MAX(kind1, kind2);
12161 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012163 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (!buf1)
12165 goto onError;
12166 buf2 = PyUnicode_DATA(sep_obj);
12167 if (kind2 != kind)
12168 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12169 if (!buf2)
12170 goto onError;
12171 len1 = PyUnicode_GET_LENGTH(str_obj);
12172 len2 = PyUnicode_GET_LENGTH(sep_obj);
12173
Victor Stinner14f8f022011-10-05 20:58:25 +020012174 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012176 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12177 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12178 else
12179 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 break;
12181 case PyUnicode_2BYTE_KIND:
12182 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12183 break;
12184 case PyUnicode_4BYTE_KIND:
12185 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12186 break;
12187 default:
12188 assert(0);
12189 out = 0;
12190 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012191
12192 Py_DECREF(sep_obj);
12193 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (kind1 != kind)
12195 PyMem_Free(buf1);
12196 if (kind2 != kind)
12197 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012198
12199 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 onError:
12201 Py_DECREF(sep_obj);
12202 Py_DECREF(str_obj);
12203 if (kind1 != kind && buf1)
12204 PyMem_Free(buf1);
12205 if (kind2 != kind && buf2)
12206 PyMem_Free(buf2);
12207 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012208}
12209
12210
12211PyObject *
12212PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12213{
12214 PyObject* str_obj;
12215 PyObject* sep_obj;
12216 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 int kind1, kind2, kind;
12218 void *buf1 = NULL, *buf2 = NULL;
12219 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220
12221 str_obj = PyUnicode_FromObject(str_in);
12222 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012224 sep_obj = PyUnicode_FromObject(sep_in);
12225 if (!sep_obj) {
12226 Py_DECREF(str_obj);
12227 return NULL;
12228 }
12229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 kind1 = PyUnicode_KIND(str_in);
12231 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012232 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 buf1 = PyUnicode_DATA(str_in);
12234 if (kind1 != kind)
12235 buf1 = _PyUnicode_AsKind(str_in, kind);
12236 if (!buf1)
12237 goto onError;
12238 buf2 = PyUnicode_DATA(sep_obj);
12239 if (kind2 != kind)
12240 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12241 if (!buf2)
12242 goto onError;
12243 len1 = PyUnicode_GET_LENGTH(str_obj);
12244 len2 = PyUnicode_GET_LENGTH(sep_obj);
12245
12246 switch(PyUnicode_KIND(str_in)) {
12247 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012248 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12249 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12250 else
12251 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 break;
12253 case PyUnicode_2BYTE_KIND:
12254 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12255 break;
12256 case PyUnicode_4BYTE_KIND:
12257 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12258 break;
12259 default:
12260 assert(0);
12261 out = 0;
12262 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012263
12264 Py_DECREF(sep_obj);
12265 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 if (kind1 != kind)
12267 PyMem_Free(buf1);
12268 if (kind2 != kind)
12269 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012270
12271 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 onError:
12273 Py_DECREF(sep_obj);
12274 Py_DECREF(str_obj);
12275 if (kind1 != kind && buf1)
12276 PyMem_Free(buf1);
12277 if (kind2 != kind && buf2)
12278 PyMem_Free(buf2);
12279 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012280}
12281
12282PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012284\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012285Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012286the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012287found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288
12289static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012290unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291{
Victor Stinner9310abb2011-10-05 00:59:23 +020012292 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012293}
12294
12295PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012296 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012297\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012298Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012300separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301
12302static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012303unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304{
Victor Stinner9310abb2011-10-05 00:59:23 +020012305 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306}
12307
Alexander Belopolsky40018472011-02-26 01:02:56 +000012308PyObject *
12309PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012310{
12311 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012312
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012313 s = PyUnicode_FromObject(s);
12314 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 if (sep != NULL) {
12317 sep = PyUnicode_FromObject(sep);
12318 if (sep == NULL) {
12319 Py_DECREF(s);
12320 return NULL;
12321 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012322 }
12323
Victor Stinner9310abb2011-10-05 00:59:23 +020012324 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012325
12326 Py_DECREF(s);
12327 Py_XDECREF(sep);
12328 return result;
12329}
12330
12331PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012333\n\
12334Return a list of the words in S, using sep as the\n\
12335delimiter string, starting at the end of the string and\n\
12336working to the front. If maxsplit is given, at most maxsplit\n\
12337splits are done. If sep is not specified, any whitespace string\n\
12338is a separator.");
12339
12340static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012341unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012342{
12343 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012344 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012345
Martin v. Löwis18e16552006-02-15 17:27:45 +000012346 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012347 return NULL;
12348
12349 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012351 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012352 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012353 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012354 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012355}
12356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012357PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359\n\
12360Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012361Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012362is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
12364static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012365unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012367 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012368 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012370 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12371 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372 return NULL;
12373
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012374 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375}
12376
12377static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012378PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012380 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381}
12382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012383PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385\n\
12386Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012387and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
12389static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012390unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392 return fixup(self, fixswapcase);
12393}
12394
Georg Brandlceee0772007-11-27 23:48:05 +000012395PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012397\n\
12398Return a translation table usable for str.translate().\n\
12399If there is only one argument, it must be a dictionary mapping Unicode\n\
12400ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012401Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012402If there are two arguments, they must be strings of equal length, and\n\
12403in the resulting dictionary, each character in x will be mapped to the\n\
12404character at the same position in y. If there is a third argument, it\n\
12405must be a string, whose characters will be mapped to None in the result.");
12406
12407static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012408unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012409{
12410 PyObject *x, *y = NULL, *z = NULL;
12411 PyObject *new = NULL, *key, *value;
12412 Py_ssize_t i = 0;
12413 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012414
Georg Brandlceee0772007-11-27 23:48:05 +000012415 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12416 return NULL;
12417 new = PyDict_New();
12418 if (!new)
12419 return NULL;
12420 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 int x_kind, y_kind, z_kind;
12422 void *x_data, *y_data, *z_data;
12423
Georg Brandlceee0772007-11-27 23:48:05 +000012424 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012425 if (!PyUnicode_Check(x)) {
12426 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12427 "be a string if there is a second argument");
12428 goto err;
12429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012431 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12432 "arguments must have equal length");
12433 goto err;
12434 }
12435 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 x_kind = PyUnicode_KIND(x);
12437 y_kind = PyUnicode_KIND(y);
12438 x_data = PyUnicode_DATA(x);
12439 y_data = PyUnicode_DATA(y);
12440 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12441 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12442 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012443 if (!key || !value)
12444 goto err;
12445 res = PyDict_SetItem(new, key, value);
12446 Py_DECREF(key);
12447 Py_DECREF(value);
12448 if (res < 0)
12449 goto err;
12450 }
12451 /* create entries for deleting chars in z */
12452 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 z_kind = PyUnicode_KIND(z);
12454 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012455 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012457 if (!key)
12458 goto err;
12459 res = PyDict_SetItem(new, key, Py_None);
12460 Py_DECREF(key);
12461 if (res < 0)
12462 goto err;
12463 }
12464 }
12465 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 int kind;
12467 void *data;
12468
Georg Brandlceee0772007-11-27 23:48:05 +000012469 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012470 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012471 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12472 "to maketrans it must be a dict");
12473 goto err;
12474 }
12475 /* copy entries into the new dict, converting string keys to int keys */
12476 while (PyDict_Next(x, &i, &key, &value)) {
12477 if (PyUnicode_Check(key)) {
12478 /* convert string keys to integer keys */
12479 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012480 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012481 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12482 "table must be of length 1");
12483 goto err;
12484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 kind = PyUnicode_KIND(key);
12486 data = PyUnicode_DATA(key);
12487 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012488 if (!newkey)
12489 goto err;
12490 res = PyDict_SetItem(new, newkey, value);
12491 Py_DECREF(newkey);
12492 if (res < 0)
12493 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012494 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012495 /* just keep integer keys */
12496 if (PyDict_SetItem(new, key, value) < 0)
12497 goto err;
12498 } else {
12499 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12500 "be strings or integers");
12501 goto err;
12502 }
12503 }
12504 }
12505 return new;
12506 err:
12507 Py_DECREF(new);
12508 return NULL;
12509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513\n\
12514Return a copy of the string S, where all characters have been mapped\n\
12515through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012516Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012517Unmapped characters are left untouched. Characters mapped to None\n\
12518are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524}
12525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012526PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
12531static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012532unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534 return fixup(self, fixupper);
12535}
12536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012540Pad a numeric string S with zeros on the left, to fill a field\n\
12541of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
12543static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012544unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012546 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012547 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012548 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 int kind;
12550 void *data;
12551 Py_UCS4 chr;
12552
Martin v. Löwis18e16552006-02-15 17:27:45 +000012553 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 return NULL;
12555
Victor Stinnerc4b49542011-12-11 22:44:26 +010012556 if (PyUnicode_READY(self) < 0)
12557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
Victor Stinnerc4b49542011-12-11 22:44:26 +010012559 if (PyUnicode_GET_LENGTH(self) >= width)
12560 return unicode_result_unchanged(self);
12561
12562 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563
12564 u = pad(self, fill, 0, '0');
12565
Walter Dörwald068325e2002-04-15 13:36:47 +000012566 if (u == NULL)
12567 return NULL;
12568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 kind = PyUnicode_KIND(u);
12570 data = PyUnicode_DATA(u);
12571 chr = PyUnicode_READ(kind, data, fill);
12572
12573 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 PyUnicode_WRITE(kind, data, 0, chr);
12576 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577 }
12578
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012579 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012580 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
12583#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012584static PyObject *
12585unicode__decimal2ascii(PyObject *self)
12586{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012588}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589#endif
12590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012591PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012594Return True if S starts with the specified prefix, False otherwise.\n\
12595With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012596With optional end, stop comparing S at that position.\n\
12597prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598
12599static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012600unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012603 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012604 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012605 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012606 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012607 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608
Jesus Ceaac451502011-04-20 17:09:23 +020012609 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012611 if (PyTuple_Check(subobj)) {
12612 Py_ssize_t i;
12613 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012614 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012615 if (substring == NULL)
12616 return NULL;
12617 result = tailmatch(self, substring, start, end, -1);
12618 Py_DECREF(substring);
12619 if (result) {
12620 Py_RETURN_TRUE;
12621 }
12622 }
12623 /* nothing matched */
12624 Py_RETURN_FALSE;
12625 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012626 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012627 if (substring == NULL) {
12628 if (PyErr_ExceptionMatches(PyExc_TypeError))
12629 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12630 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012632 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012633 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012635 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636}
12637
12638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012639PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012642Return True if S ends with the specified suffix, False otherwise.\n\
12643With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012644With optional end, stop comparing S at that position.\n\
12645suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646
12647static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012648unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012651 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012652 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012653 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012654 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012655 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
Jesus Ceaac451502011-04-20 17:09:23 +020012657 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012659 if (PyTuple_Check(subobj)) {
12660 Py_ssize_t i;
12661 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012662 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012664 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012666 result = tailmatch(self, substring, start, end, +1);
12667 Py_DECREF(substring);
12668 if (result) {
12669 Py_RETURN_TRUE;
12670 }
12671 }
12672 Py_RETURN_FALSE;
12673 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012674 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012675 if (substring == NULL) {
12676 if (PyErr_ExceptionMatches(PyExc_TypeError))
12677 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12678 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012680 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012681 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012683 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012687
12688PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012690\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012691Return a formatted version of S, using substitutions from args and kwargs.\n\
12692The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012693
Eric Smith27bbca62010-11-04 17:06:58 +000012694PyDoc_STRVAR(format_map__doc__,
12695 "S.format_map(mapping) -> str\n\
12696\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012697Return a formatted version of S, using substitutions from mapping.\n\
12698The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012699
Eric Smith4a7d76d2008-05-30 18:10:19 +000012700static PyObject *
12701unicode__format__(PyObject* self, PyObject* args)
12702{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012703 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012704
12705 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12706 return NULL;
12707
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012708 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012710 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012711}
12712
Eric Smith8c663262007-08-25 02:26:07 +000012713PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012715\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012716Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012717
12718static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012719unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012720{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 Py_ssize_t size;
12722
12723 /* If it's a compact object, account for base structure +
12724 character data. */
12725 if (PyUnicode_IS_COMPACT_ASCII(v))
12726 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12727 else if (PyUnicode_IS_COMPACT(v))
12728 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012729 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 else {
12731 /* If it is a two-block object, account for base object, and
12732 for character block if present. */
12733 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012734 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012736 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 }
12738 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012739 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012740 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012742 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012743 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744
12745 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012746}
12747
12748PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012750
12751static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012752unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012753{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012754 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 if (!copy)
12756 return NULL;
12757 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012758}
12759
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760static PyMethodDef unicode_methods[] = {
12761
12762 /* Order is according to common usage: often used methods should
12763 appear first, since lookup is done sequentially. */
12764
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012765 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012766 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12767 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012768 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012769 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12770 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12771 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12772 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12773 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12774 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12775 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012776 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012777 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12778 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12779 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012780 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012781 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12782 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12783 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012784 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012785 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012786 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012787 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012788 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12789 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12790 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12791 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12792 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12793 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12794 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12795 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12796 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12797 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12798 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12799 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12800 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12801 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012802 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012803 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012804 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012805 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012806 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012807 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012808 {"maketrans", (PyCFunction) unicode_maketrans,
12809 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012810 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012811#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012812 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813#endif
12814
12815#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012816 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012817 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818#endif
12819
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821 {NULL, NULL}
12822};
12823
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012824static PyObject *
12825unicode_mod(PyObject *v, PyObject *w)
12826{
Brian Curtindfc80e32011-08-10 20:28:54 -050012827 if (!PyUnicode_Check(v))
12828 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012830}
12831
12832static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012833 0, /*nb_add*/
12834 0, /*nb_subtract*/
12835 0, /*nb_multiply*/
12836 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012837};
12838
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012840 (lenfunc) unicode_length, /* sq_length */
12841 PyUnicode_Concat, /* sq_concat */
12842 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12843 (ssizeargfunc) unicode_getitem, /* sq_item */
12844 0, /* sq_slice */
12845 0, /* sq_ass_item */
12846 0, /* sq_ass_slice */
12847 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848};
12849
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012850static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012851unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 if (PyUnicode_READY(self) == -1)
12854 return NULL;
12855
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012856 if (PyIndex_Check(item)) {
12857 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012858 if (i == -1 && PyErr_Occurred())
12859 return NULL;
12860 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012862 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012863 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012864 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012865 PyObject *result;
12866 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012867 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012868 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012872 return NULL;
12873 }
12874
12875 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012876 Py_INCREF(unicode_empty);
12877 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010012879 slicelength == PyUnicode_GET_LENGTH(self)) {
12880 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000012881 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012882 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012883 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012884 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012885 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012886 src_kind = PyUnicode_KIND(self);
12887 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012888 if (!PyUnicode_IS_ASCII(self)) {
12889 kind_limit = kind_maxchar_limit(src_kind);
12890 max_char = 0;
12891 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12892 ch = PyUnicode_READ(src_kind, src_data, cur);
12893 if (ch > max_char) {
12894 max_char = ch;
12895 if (max_char >= kind_limit)
12896 break;
12897 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012898 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012899 }
Victor Stinner55c99112011-10-13 01:17:06 +020012900 else
12901 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012902 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012903 if (result == NULL)
12904 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012905 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012906 dest_data = PyUnicode_DATA(result);
12907
12908 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012909 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12910 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012911 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012912 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012913 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012914 } else {
12915 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12916 return NULL;
12917 }
12918}
12919
12920static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012921 (lenfunc)unicode_length, /* mp_length */
12922 (binaryfunc)unicode_subscript, /* mp_subscript */
12923 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012924};
12925
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927/* Helpers for PyUnicode_Format() */
12928
12929static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012930getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012932 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 (*p_argidx)++;
12935 if (arglen < 0)
12936 return args;
12937 else
12938 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939 }
12940 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012941 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942 return NULL;
12943}
12944
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012945/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012947static PyObject *
12948formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012950 char *p;
12951 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012953
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954 x = PyFloat_AsDouble(v);
12955 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012956 return NULL;
12957
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012959 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012960
Eric Smith0923d1d2009-04-16 20:16:10 +000012961 p = PyOS_double_to_string(x, type, prec,
12962 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012963 if (p == NULL)
12964 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012966 PyMem_Free(p);
12967 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968}
12969
Tim Peters38fd5b62000-09-21 05:43:11 +000012970static PyObject*
12971formatlong(PyObject *val, int flags, int prec, int type)
12972{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 char *buf;
12974 int len;
12975 PyObject *str; /* temporary string object. */
12976 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012977
Benjamin Peterson14339b62009-01-31 16:36:08 +000012978 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12979 if (!str)
12980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012982 Py_DECREF(str);
12983 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012984}
12985
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012986static Py_UCS4
12987formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012989 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012990 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012992 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 goto onError;
12995 }
12996 else {
12997 /* Integer input truncated to a character */
12998 long x;
12999 x = PyLong_AsLong(v);
13000 if (x == -1 && PyErr_Occurred())
13001 goto onError;
13002
Victor Stinner8faf8212011-12-08 22:14:11 +010013003 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 PyErr_SetString(PyExc_OverflowError,
13005 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013006 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 }
13008
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013009 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013010 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013011
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013013 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013014 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013015 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016}
13017
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013018static int
13019repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13020{
13021 int r;
13022 assert(count > 0);
13023 assert(PyUnicode_Check(obj));
13024 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013025 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013026 if (repeated == NULL)
13027 return -1;
13028 r = _PyAccu_Accumulate(acc, repeated);
13029 Py_DECREF(repeated);
13030 return r;
13031 }
13032 else {
13033 do {
13034 if (_PyAccu_Accumulate(acc, obj))
13035 return -1;
13036 } while (--count);
13037 return 0;
13038 }
13039}
13040
Alexander Belopolsky40018472011-02-26 01:02:56 +000013041PyObject *
13042PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 void *fmt;
13045 int fmtkind;
13046 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013048 int r;
13049 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013052 PyObject *temp = NULL;
13053 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013054 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013055 _PyAccu acc;
13056 static PyObject *plus, *minus, *blank, *zero, *percent;
13057
13058 if (!plus && !(plus = get_latin1_char('+')))
13059 return NULL;
13060 if (!minus && !(minus = get_latin1_char('-')))
13061 return NULL;
13062 if (!blank && !(blank = get_latin1_char(' ')))
13063 return NULL;
13064 if (!zero && !(zero = get_latin1_char('0')))
13065 return NULL;
13066 if (!percent && !(percent = get_latin1_char('%')))
13067 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013068
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 PyErr_BadInternalCall();
13071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013073 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013076 if (_PyAccu_Init(&acc))
13077 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 fmt = PyUnicode_DATA(uformat);
13079 fmtkind = PyUnicode_KIND(uformat);
13080 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13081 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013084 arglen = PyTuple_Size(args);
13085 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086 }
13087 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 arglen = -1;
13089 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013091 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013092 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013093 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094
13095 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013097 PyObject *nonfmt;
13098 Py_ssize_t nonfmtpos;
13099 nonfmtpos = fmtpos++;
13100 while (fmtcnt >= 0 &&
13101 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13102 fmtpos++;
13103 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013104 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013105 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013106 if (nonfmt == NULL)
13107 goto onError;
13108 r = _PyAccu_Accumulate(&acc, nonfmt);
13109 Py_DECREF(nonfmt);
13110 if (r)
13111 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013112 }
13113 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 /* Got a format specifier */
13115 int flags = 0;
13116 Py_ssize_t width = -1;
13117 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013119 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 int isnumok;
13121 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013122 void *pbuf = NULL;
13123 Py_ssize_t pindex, len;
13124 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 fmtpos++;
13127 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13128 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 Py_ssize_t keylen;
13130 PyObject *key;
13131 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013132
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 if (dict == NULL) {
13134 PyErr_SetString(PyExc_TypeError,
13135 "format requires a mapping");
13136 goto onError;
13137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 /* Skip over balanced parentheses */
13142 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013146 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 if (fmtcnt < 0 || pcount > 0) {
13151 PyErr_SetString(PyExc_ValueError,
13152 "incomplete format key");
13153 goto onError;
13154 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013155 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013156 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 if (key == NULL)
13158 goto onError;
13159 if (args_owned) {
13160 Py_DECREF(args);
13161 args_owned = 0;
13162 }
13163 args = PyObject_GetItem(dict, key);
13164 Py_DECREF(key);
13165 if (args == NULL) {
13166 goto onError;
13167 }
13168 args_owned = 1;
13169 arglen = -1;
13170 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013171 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 case '-': flags |= F_LJUST; continue;
13175 case '+': flags |= F_SIGN; continue;
13176 case ' ': flags |= F_BLANK; continue;
13177 case '#': flags |= F_ALT; continue;
13178 case '0': flags |= F_ZERO; continue;
13179 }
13180 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 if (c == '*') {
13183 v = getnextarg(args, arglen, &argidx);
13184 if (v == NULL)
13185 goto onError;
13186 if (!PyLong_Check(v)) {
13187 PyErr_SetString(PyExc_TypeError,
13188 "* wants int");
13189 goto onError;
13190 }
13191 width = PyLong_AsLong(v);
13192 if (width == -1 && PyErr_Occurred())
13193 goto onError;
13194 if (width < 0) {
13195 flags |= F_LJUST;
13196 width = -width;
13197 }
13198 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 }
13201 else if (c >= '0' && c <= '9') {
13202 width = c - '0';
13203 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 if (c < '0' || c > '9')
13206 break;
13207 if ((width*10) / 10 != width) {
13208 PyErr_SetString(PyExc_ValueError,
13209 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 }
13212 width = width*10 + (c - '0');
13213 }
13214 }
13215 if (c == '.') {
13216 prec = 0;
13217 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013219 if (c == '*') {
13220 v = getnextarg(args, arglen, &argidx);
13221 if (v == NULL)
13222 goto onError;
13223 if (!PyLong_Check(v)) {
13224 PyErr_SetString(PyExc_TypeError,
13225 "* wants int");
13226 goto onError;
13227 }
13228 prec = PyLong_AsLong(v);
13229 if (prec == -1 && PyErr_Occurred())
13230 goto onError;
13231 if (prec < 0)
13232 prec = 0;
13233 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 }
13236 else if (c >= '0' && c <= '9') {
13237 prec = c - '0';
13238 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 if (c < '0' || c > '9')
13241 break;
13242 if ((prec*10) / 10 != prec) {
13243 PyErr_SetString(PyExc_ValueError,
13244 "prec too big");
13245 goto onError;
13246 }
13247 prec = prec*10 + (c - '0');
13248 }
13249 }
13250 } /* prec */
13251 if (fmtcnt >= 0) {
13252 if (c == 'h' || c == 'l' || c == 'L') {
13253 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 }
13256 }
13257 if (fmtcnt < 0) {
13258 PyErr_SetString(PyExc_ValueError,
13259 "incomplete format");
13260 goto onError;
13261 }
13262 if (c != '%') {
13263 v = getnextarg(args, arglen, &argidx);
13264 if (v == NULL)
13265 goto onError;
13266 }
13267 sign = 0;
13268 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013269 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013270 switch (c) {
13271
13272 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013273 _PyAccu_Accumulate(&acc, percent);
13274 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013275
13276 case 's':
13277 case 'r':
13278 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013279 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 temp = v;
13281 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013282 }
13283 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 if (c == 's')
13285 temp = PyObject_Str(v);
13286 else if (c == 'r')
13287 temp = PyObject_Repr(v);
13288 else
13289 temp = PyObject_ASCII(v);
13290 if (temp == NULL)
13291 goto onError;
13292 if (PyUnicode_Check(temp))
13293 /* nothing to do */;
13294 else {
13295 Py_DECREF(temp);
13296 PyErr_SetString(PyExc_TypeError,
13297 "%s argument has non-string str()");
13298 goto onError;
13299 }
13300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 if (PyUnicode_READY(temp) == -1) {
13302 Py_CLEAR(temp);
13303 goto onError;
13304 }
13305 pbuf = PyUnicode_DATA(temp);
13306 kind = PyUnicode_KIND(temp);
13307 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 if (prec >= 0 && len > prec)
13309 len = prec;
13310 break;
13311
13312 case 'i':
13313 case 'd':
13314 case 'u':
13315 case 'o':
13316 case 'x':
13317 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 isnumok = 0;
13319 if (PyNumber_Check(v)) {
13320 PyObject *iobj=NULL;
13321
13322 if (PyLong_Check(v)) {
13323 iobj = v;
13324 Py_INCREF(iobj);
13325 }
13326 else {
13327 iobj = PyNumber_Long(v);
13328 }
13329 if (iobj!=NULL) {
13330 if (PyLong_Check(iobj)) {
13331 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013332 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013333 Py_DECREF(iobj);
13334 if (!temp)
13335 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 if (PyUnicode_READY(temp) == -1) {
13337 Py_CLEAR(temp);
13338 goto onError;
13339 }
13340 pbuf = PyUnicode_DATA(temp);
13341 kind = PyUnicode_KIND(temp);
13342 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 sign = 1;
13344 }
13345 else {
13346 Py_DECREF(iobj);
13347 }
13348 }
13349 }
13350 if (!isnumok) {
13351 PyErr_Format(PyExc_TypeError,
13352 "%%%c format: a number is required, "
13353 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13354 goto onError;
13355 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013356 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013358 fillobj = zero;
13359 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013360 break;
13361
13362 case 'e':
13363 case 'E':
13364 case 'f':
13365 case 'F':
13366 case 'g':
13367 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013368 temp = formatfloat(v, flags, prec, c);
13369 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 if (PyUnicode_READY(temp) == -1) {
13372 Py_CLEAR(temp);
13373 goto onError;
13374 }
13375 pbuf = PyUnicode_DATA(temp);
13376 kind = PyUnicode_KIND(temp);
13377 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013379 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013381 fillobj = zero;
13382 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 break;
13384
13385 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013386 {
13387 Py_UCS4 ch = formatchar(v);
13388 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013390 temp = _PyUnicode_FromUCS4(&ch, 1);
13391 if (temp == NULL)
13392 goto onError;
13393 pbuf = PyUnicode_DATA(temp);
13394 kind = PyUnicode_KIND(temp);
13395 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013398
13399 default:
13400 PyErr_Format(PyExc_ValueError,
13401 "unsupported format character '%c' (0x%x) "
13402 "at index %zd",
13403 (31<=c && c<=126) ? (char)c : '?',
13404 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 goto onError;
13407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013408 /* pbuf is initialized here. */
13409 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13412 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414 pindex++;
13415 }
13416 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13417 signobj = plus;
13418 len--;
13419 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 }
13421 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013422 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013424 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 else
13426 sign = 0;
13427 }
13428 if (width < len)
13429 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013431 if (fill != ' ') {
13432 assert(signobj != NULL);
13433 if (_PyAccu_Accumulate(&acc, signobj))
13434 goto onError;
13435 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 if (width > len)
13437 width--;
13438 }
13439 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013441 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443 second = get_latin1_char(
13444 PyUnicode_READ(kind, pbuf, pindex + 1));
13445 pindex += 2;
13446 if (second == NULL ||
13447 _PyAccu_Accumulate(&acc, zero) ||
13448 _PyAccu_Accumulate(&acc, second))
13449 goto onError;
13450 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 width -= 2;
13453 if (width < 0)
13454 width = 0;
13455 len -= 2;
13456 }
13457 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013458 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013459 if (repeat_accumulate(&acc, fillobj, width - len))
13460 goto onError;
13461 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 }
13463 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013464 if (sign) {
13465 assert(signobj != NULL);
13466 if (_PyAccu_Accumulate(&acc, signobj))
13467 goto onError;
13468 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013470 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13471 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013472 second = get_latin1_char(
13473 PyUnicode_READ(kind, pbuf, pindex + 1));
13474 pindex += 2;
13475 if (second == NULL ||
13476 _PyAccu_Accumulate(&acc, zero) ||
13477 _PyAccu_Accumulate(&acc, second))
13478 goto onError;
13479 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013480 }
13481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013482 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013483 if (temp != NULL) {
13484 assert(pbuf == PyUnicode_DATA(temp));
13485 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013487 else {
13488 const char *p = (const char *) pbuf;
13489 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013490 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013491 v = PyUnicode_FromKindAndData(kind, p, len);
13492 }
13493 if (v == NULL)
13494 goto onError;
13495 r = _PyAccu_Accumulate(&acc, v);
13496 Py_DECREF(v);
13497 if (r)
13498 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013499 if (width > len && repeat_accumulate(&acc, blank, width - len))
13500 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 if (dict && (argidx < arglen) && c != '%') {
13502 PyErr_SetString(PyExc_TypeError,
13503 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 goto onError;
13505 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013506 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508 } /* until end */
13509 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 PyErr_SetString(PyExc_TypeError,
13511 "not all arguments converted during string formatting");
13512 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013513 }
13514
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013515 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013516 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518 }
13519 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013520 Py_XDECREF(temp);
13521 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013522 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013526 Py_XDECREF(temp);
13527 Py_XDECREF(second);
13528 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013529 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013531 }
13532 return NULL;
13533}
13534
Jeremy Hylton938ace62002-07-17 16:30:39 +000013535static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013536unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13537
Tim Peters6d6c1a32001-08-02 04:15:00 +000013538static PyObject *
13539unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13540{
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 static char *kwlist[] = {"object", "encoding", "errors", 0};
13543 char *encoding = NULL;
13544 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013545
Benjamin Peterson14339b62009-01-31 16:36:08 +000013546 if (type != &PyUnicode_Type)
13547 return unicode_subtype_new(type, args, kwds);
13548 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013549 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013550 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013551 if (x == NULL) {
13552 Py_INCREF(unicode_empty);
13553 return unicode_empty;
13554 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013555 if (encoding == NULL && errors == NULL)
13556 return PyObject_Str(x);
13557 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013559}
13560
Guido van Rossume023fe02001-08-30 03:12:59 +000013561static PyObject *
13562unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13563{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013564 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013565 Py_ssize_t length, char_size;
13566 int share_wstr, share_utf8;
13567 unsigned int kind;
13568 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013569
Benjamin Peterson14339b62009-01-31 16:36:08 +000013570 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013571
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013572 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013573 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013574 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013575 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013576 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013577 return NULL;
13578
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013579 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013580 if (self == NULL) {
13581 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013582 return NULL;
13583 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013584 kind = PyUnicode_KIND(unicode);
13585 length = PyUnicode_GET_LENGTH(unicode);
13586
13587 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013588#ifdef Py_DEBUG
13589 _PyUnicode_HASH(self) = -1;
13590#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013591 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013592#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013593 _PyUnicode_STATE(self).interned = 0;
13594 _PyUnicode_STATE(self).kind = kind;
13595 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013596 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013597 _PyUnicode_STATE(self).ready = 1;
13598 _PyUnicode_WSTR(self) = NULL;
13599 _PyUnicode_UTF8_LENGTH(self) = 0;
13600 _PyUnicode_UTF8(self) = NULL;
13601 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013602 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013603
13604 share_utf8 = 0;
13605 share_wstr = 0;
13606 if (kind == PyUnicode_1BYTE_KIND) {
13607 char_size = 1;
13608 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13609 share_utf8 = 1;
13610 }
13611 else if (kind == PyUnicode_2BYTE_KIND) {
13612 char_size = 2;
13613 if (sizeof(wchar_t) == 2)
13614 share_wstr = 1;
13615 }
13616 else {
13617 assert(kind == PyUnicode_4BYTE_KIND);
13618 char_size = 4;
13619 if (sizeof(wchar_t) == 4)
13620 share_wstr = 1;
13621 }
13622
13623 /* Ensure we won't overflow the length. */
13624 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13625 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013626 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013627 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013628 data = PyObject_MALLOC((length + 1) * char_size);
13629 if (data == NULL) {
13630 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 goto onError;
13632 }
13633
Victor Stinnerc3c74152011-10-02 20:39:55 +020013634 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013635 if (share_utf8) {
13636 _PyUnicode_UTF8_LENGTH(self) = length;
13637 _PyUnicode_UTF8(self) = data;
13638 }
13639 if (share_wstr) {
13640 _PyUnicode_WSTR_LENGTH(self) = length;
13641 _PyUnicode_WSTR(self) = (wchar_t *)data;
13642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013643
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013644 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013645 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013646 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013647#ifdef Py_DEBUG
13648 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13649#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013650 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013651 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013652
13653onError:
13654 Py_DECREF(unicode);
13655 Py_DECREF(self);
13656 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013657}
13658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013659PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013661\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013662Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013663encoding defaults to the current default string encoding.\n\
13664errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013665
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013666static PyObject *unicode_iter(PyObject *seq);
13667
Guido van Rossumd57fd912000-03-10 22:53:23 +000013668PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013669 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013670 "str", /* tp_name */
13671 sizeof(PyUnicodeObject), /* tp_size */
13672 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013673 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013674 (destructor)unicode_dealloc, /* tp_dealloc */
13675 0, /* tp_print */
13676 0, /* tp_getattr */
13677 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013678 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013679 unicode_repr, /* tp_repr */
13680 &unicode_as_number, /* tp_as_number */
13681 &unicode_as_sequence, /* tp_as_sequence */
13682 &unicode_as_mapping, /* tp_as_mapping */
13683 (hashfunc) unicode_hash, /* tp_hash*/
13684 0, /* tp_call*/
13685 (reprfunc) unicode_str, /* tp_str */
13686 PyObject_GenericGetAttr, /* tp_getattro */
13687 0, /* tp_setattro */
13688 0, /* tp_as_buffer */
13689 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013691 unicode_doc, /* tp_doc */
13692 0, /* tp_traverse */
13693 0, /* tp_clear */
13694 PyUnicode_RichCompare, /* tp_richcompare */
13695 0, /* tp_weaklistoffset */
13696 unicode_iter, /* tp_iter */
13697 0, /* tp_iternext */
13698 unicode_methods, /* tp_methods */
13699 0, /* tp_members */
13700 0, /* tp_getset */
13701 &PyBaseObject_Type, /* tp_base */
13702 0, /* tp_dict */
13703 0, /* tp_descr_get */
13704 0, /* tp_descr_set */
13705 0, /* tp_dictoffset */
13706 0, /* tp_init */
13707 0, /* tp_alloc */
13708 unicode_new, /* tp_new */
13709 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710};
13711
13712/* Initialize the Unicode implementation */
13713
Victor Stinner3a50e702011-10-18 21:21:00 +020013714int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013715{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013716 int i;
13717
Thomas Wouters477c8d52006-05-27 19:21:47 +000013718 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013719 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013720 0x000A, /* LINE FEED */
13721 0x000D, /* CARRIAGE RETURN */
13722 0x001C, /* FILE SEPARATOR */
13723 0x001D, /* GROUP SEPARATOR */
13724 0x001E, /* RECORD SEPARATOR */
13725 0x0085, /* NEXT LINE */
13726 0x2028, /* LINE SEPARATOR */
13727 0x2029, /* PARAGRAPH SEPARATOR */
13728 };
13729
Fred Drakee4315f52000-05-09 19:53:39 +000013730 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013731 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013732 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013733 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013734 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013735
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013736 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013738 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013740
13741 /* initialize the linebreak bloom filter */
13742 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013743 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013744 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013745
13746 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013747
13748#ifdef HAVE_MBCS
13749 winver.dwOSVersionInfoSize = sizeof(winver);
13750 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13751 PyErr_SetFromWindowsErr(0);
13752 return -1;
13753 }
13754#endif
13755 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756}
13757
13758/* Finalize the Unicode implementation */
13759
/* Does no work in this implementation; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13765
Guido van Rossumd57fd912000-03-10 22:53:23 +000013766void
Thomas Wouters78890102000-07-22 19:25:51 +000013767_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013769 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013770
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013771 Py_XDECREF(unicode_empty);
13772 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013773
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013774 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013775 if (unicode_latin1[i]) {
13776 Py_DECREF(unicode_latin1[i]);
13777 unicode_latin1[i] = NULL;
13778 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013779 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013780 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013781 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013783
Walter Dörwald16807132007-05-25 13:52:07 +000013784void
13785PyUnicode_InternInPlace(PyObject **p)
13786{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013787 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013788 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013789#ifdef Py_DEBUG
13790 assert(s != NULL);
13791 assert(_PyUnicode_CHECK(s));
13792#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013793 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013794 return;
13795#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013796 /* If it's a subclass, we don't really know what putting
13797 it in the interned dict might do. */
13798 if (!PyUnicode_CheckExact(s))
13799 return;
13800 if (PyUnicode_CHECK_INTERNED(s))
13801 return;
13802 if (interned == NULL) {
13803 interned = PyDict_New();
13804 if (interned == NULL) {
13805 PyErr_Clear(); /* Don't leave an exception */
13806 return;
13807 }
13808 }
13809 /* It might be that the GetItem call fails even
13810 though the key is present in the dictionary,
13811 namely when this happens during a stack overflow. */
13812 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013813 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013814 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013815
Benjamin Peterson29060642009-01-31 22:14:21 +000013816 if (t) {
13817 Py_INCREF(t);
13818 Py_DECREF(*p);
13819 *p = t;
13820 return;
13821 }
Walter Dörwald16807132007-05-25 13:52:07 +000013822
Benjamin Peterson14339b62009-01-31 16:36:08 +000013823 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013824 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013825 PyErr_Clear();
13826 PyThreadState_GET()->recursion_critical = 0;
13827 return;
13828 }
13829 PyThreadState_GET()->recursion_critical = 0;
13830 /* The two references in interned are not counted by refcnt.
13831 The deallocator will take care of this */
13832 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013834}
13835
13836void
13837PyUnicode_InternImmortal(PyObject **p)
13838{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013839 PyUnicode_InternInPlace(p);
13840 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013841 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013842 Py_INCREF(*p);
13843 }
Walter Dörwald16807132007-05-25 13:52:07 +000013844}
13845
13846PyObject *
13847PyUnicode_InternFromString(const char *cp)
13848{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013849 PyObject *s = PyUnicode_FromString(cp);
13850 if (s == NULL)
13851 return NULL;
13852 PyUnicode_InternInPlace(&s);
13853 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013854}
13855
Alexander Belopolsky40018472011-02-26 01:02:56 +000013856void
13857_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013858{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013859 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013860 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 Py_ssize_t i, n;
13862 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013863
Benjamin Peterson14339b62009-01-31 16:36:08 +000013864 if (interned == NULL || !PyDict_Check(interned))
13865 return;
13866 keys = PyDict_Keys(interned);
13867 if (keys == NULL || !PyList_Check(keys)) {
13868 PyErr_Clear();
13869 return;
13870 }
Walter Dörwald16807132007-05-25 13:52:07 +000013871
Benjamin Peterson14339b62009-01-31 16:36:08 +000013872 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13873 detector, interned unicode strings are not forcibly deallocated;
13874 rather, we give them their stolen references back, and then clear
13875 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013876
Benjamin Peterson14339b62009-01-31 16:36:08 +000013877 n = PyList_GET_SIZE(keys);
13878 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013879 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013881 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013882 if (PyUnicode_READY(s) == -1) {
13883 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013887 case SSTATE_NOT_INTERNED:
13888 /* XXX Shouldn't happen */
13889 break;
13890 case SSTATE_INTERNED_IMMORTAL:
13891 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013893 break;
13894 case SSTATE_INTERNED_MORTAL:
13895 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013896 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013897 break;
13898 default:
13899 Py_FatalError("Inconsistent interned string state.");
13900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 }
13903 fprintf(stderr, "total size of all interned strings: "
13904 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13905 "mortal/immortal\n", mortal_size, immortal_size);
13906 Py_DECREF(keys);
13907 PyDict_Clear(interned);
13908 Py_DECREF(interned);
13909 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013910}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013911
13912
13913/********************* Unicode Iterator **************************/
13914
13915typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013916 PyObject_HEAD
13917 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013918 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013919} unicodeiterobject;
13920
13921static void
13922unicodeiter_dealloc(unicodeiterobject *it)
13923{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013924 _PyObject_GC_UNTRACK(it);
13925 Py_XDECREF(it->it_seq);
13926 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013927}
13928
13929static int
13930unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13931{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013932 Py_VISIT(it->it_seq);
13933 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013934}
13935
13936static PyObject *
13937unicodeiter_next(unicodeiterobject *it)
13938{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013939 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013940
Benjamin Peterson14339b62009-01-31 16:36:08 +000013941 assert(it != NULL);
13942 seq = it->it_seq;
13943 if (seq == NULL)
13944 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013945 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13948 int kind = PyUnicode_KIND(seq);
13949 void *data = PyUnicode_DATA(seq);
13950 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13951 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013952 if (item != NULL)
13953 ++it->it_index;
13954 return item;
13955 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013956
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 Py_DECREF(seq);
13958 it->it_seq = NULL;
13959 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013960}
13961
13962static PyObject *
13963unicodeiter_len(unicodeiterobject *it)
13964{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 Py_ssize_t len = 0;
13966 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013967 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013969}
13970
13971PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13972
13973static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013974 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013975 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013977};
13978
13979PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13981 "str_iterator", /* tp_name */
13982 sizeof(unicodeiterobject), /* tp_basicsize */
13983 0, /* tp_itemsize */
13984 /* methods */
13985 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13986 0, /* tp_print */
13987 0, /* tp_getattr */
13988 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013989 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 0, /* tp_repr */
13991 0, /* tp_as_number */
13992 0, /* tp_as_sequence */
13993 0, /* tp_as_mapping */
13994 0, /* tp_hash */
13995 0, /* tp_call */
13996 0, /* tp_str */
13997 PyObject_GenericGetAttr, /* tp_getattro */
13998 0, /* tp_setattro */
13999 0, /* tp_as_buffer */
14000 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14001 0, /* tp_doc */
14002 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14003 0, /* tp_clear */
14004 0, /* tp_richcompare */
14005 0, /* tp_weaklistoffset */
14006 PyObject_SelfIter, /* tp_iter */
14007 (iternextfunc)unicodeiter_next, /* tp_iternext */
14008 unicodeiter_methods, /* tp_methods */
14009 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014010};
14011
14012static PyObject *
14013unicode_iter(PyObject *seq)
14014{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014016
Benjamin Peterson14339b62009-01-31 16:36:08 +000014017 if (!PyUnicode_Check(seq)) {
14018 PyErr_BadInternalCall();
14019 return NULL;
14020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014021 if (PyUnicode_READY(seq) == -1)
14022 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14024 if (it == NULL)
14025 return NULL;
14026 it->it_index = 0;
14027 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014028 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 _PyObject_GC_TRACK(it);
14030 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014031}
14032
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014033
/* Return the number of Py_UNICODE units in `u` before the terminating
   zero unit.  `u` must be zero-terminated.

   The counter has the same type as the return value, so the result is
   not limited to the range of int. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;

    while (u[res] != 0)
        res++;
    return res;
}
14042
/* Copy the zero-terminated string `s2`, terminator included, into `s1`
   and return `s1`.  No bounds checking is performed. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE unit = *s2++;
        *dst++ = unit;
        if (unit == 0)
            break;
    }
    return s1;
}
14050
/* Copy at most `n` Py_UNICODE units from `s2` into `s1` and return `s1`.

   Copying stops right after the terminating zero unit has been copied,
   or once `n` units have been written, whichever comes first.  As with
   C strncpy(), the result is not zero-terminated when `s2` holds `n` or
   more units before its terminator.  Unlike strncpy(), the remainder of
   `s1` is not padded with zeros.

   The limit is tested before each store, so no more than `n` units are
   ever written and nothing is written when `n` is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n != 0) {
        n--;
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14060
/* Append the zero-terminated string `s2`, terminator included, to the
   end of the zero-terminated string `s1` and return `s1`.  No bounds
   checking is performed. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    /* Find the terminator of s1 ... */
    while (*dst != 0)
        dst++;
    /* ... and copy s2 over it. */
    while ((*dst++ = *s2++) != 0)
        ;
    return s1;
}
14069
/* Compare two zero-terminated Py_UNICODE strings unit by unit.
   Returns -1, 0 or +1 when `s1` sorts before, equal to or after `s2`;
   a string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;
    if (*s2 == 0)
        return 1;
    return (*s1 < *s2) ? -1 : +1;
}
14083
/* Compare at most `n` units of two zero-terminated Py_UNICODE strings.
   Returns -1 or +1 at the first differing unit, and 0 when the strings
   agree up to a common terminator or over the first `n` units. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n-- != 0) {
        Py_UNICODE c1 = *s1++;
        Py_UNICODE c2 = *s2++;

        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
        if (c1 == '\0')
            return 0;
    }
    return 0;
}
14100
/* Return a pointer to the first occurrence of `c` in the
   zero-terminated string `s`, or NULL if there is none.  The terminator
   itself is never matched. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    for (; *s != 0; s++) {
        if (*s == c)
            return (Py_UNICODE*)s;
    }
    return NULL;
}
14110
/* Return a pointer to the last occurrence of `c` in the zero-terminated
   string `s`, or NULL if there is none.  The terminator itself is never
   matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    /* Walk to the terminator, then scan backwards towards the start. */
    while (*cursor != 0)
        cursor++;
    while (cursor != s) {
        if (*--cursor == c)
            return (Py_UNICODE*)cursor;
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014123
Victor Stinner71133ff2010-09-01 23:43:53 +000014124Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014125PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014126{
Victor Stinner577db2c2011-10-11 22:12:48 +020014127 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014128 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014130 if (!PyUnicode_Check(unicode)) {
14131 PyErr_BadArgument();
14132 return NULL;
14133 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014134 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014135 if (u == NULL)
14136 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014137 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014138 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014139 PyErr_NoMemory();
14140 return NULL;
14141 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014142 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014143 size *= sizeof(Py_UNICODE);
14144 copy = PyMem_Malloc(size);
14145 if (copy == NULL) {
14146 PyErr_NoMemory();
14147 return NULL;
14148 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014149 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014150 return copy;
14151}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014152
Georg Brandl66c221e2010-10-14 07:04:07 +000014153/* A _string module, to export formatter_parser and formatter_field_name_split
14154 to the string.Formatter class implemented in Python. */
14155
14156static PyMethodDef _string_methods[] = {
14157 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14158 METH_O, PyDoc_STR("split the argument as a field name")},
14159 {"formatter_parser", (PyCFunction) formatter_parser,
14160 METH_O, PyDoc_STR("parse the argument as a format string")},
14161 {NULL, NULL}
14162};
14163
14164static struct PyModuleDef _string_module = {
14165 PyModuleDef_HEAD_INIT,
14166 "_string",
14167 PyDoc_STR("string helper module"),
14168 0,
14169 _string_methods,
14170 NULL,
14171 NULL,
14172 NULL,
14173 NULL
14174};
14175
14176PyMODINIT_FUNC
14177PyInit__string(void)
14178{
14179 return PyModule_Create(&_string_module);
14180}
14181
14182
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014183#ifdef __cplusplus
14184}
14185#endif