blob: a2c3227df0b136eb0d83a6ab5c3a3ef3c3c93726 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection: big endian when the build defines WORDS_BIGENDIAN,
   little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the asserting macros in this file: debug builds run
   _PyUnicode_CheckConsistency() (without the content scan), other builds
   only verify the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.  The underscore-prefixed forms are plain, unchecked
   lvalue accessors; PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that
   the object is a ready string and special-case compact ASCII objects. */

/* utf8 pointer stored in the compact header */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 bytes: for compact ASCII this is the memory that directly follows
   the PyASCIIObject header, otherwise the stored utf8 pointer */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length field stored in the compact header */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for compact ASCII this is the character length itself */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the base header */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* checked read access to the kind and the length */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data pointer of a non-compact (PyUnicodeObject) string */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): asserts that op is a string,
   then yields 0 when it is already ready and otherwise the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of op aliases its character data.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op aliases its character data.
   Every reference goes through the macro parameter "op", so the macro does
   not depend on what the caller's variable happens to be called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True when op carries its own UTF-8 buffer: it is not compact ASCII, the
   utf8 pointer is set, and that pointer does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True when op carries its own wstr buffer: the wstr pointer is set and
   either the string is not ready yet or wstr does not alias the character
   data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL                               \
      && !(PyUnicode_IS_READY(op)                               \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer to the buffer where the result characters
   are written; it is converted to "to_type *" as a whole expression, so
   arguments such as "base + offset" are offset in the caller's own pointer
   units before the cast.

   The copy is done four elements at a time for the largest multiple of four
   that fits, followed by an element-wise loop for the remainder. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: one flag per
   ASCII code point (index 0..127), 1 meaning "whitespace".

   Flagged code points:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Fast detection of line-break characters: one flag per ASCII code point
   (index 0..127), 1 meaning "line break".  The table is only ever read, so
   it is declared const like _Py_ascii_whitespace.

   Flagged code points:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0A - 0x0D */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x1C - 0x1E */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build invariant checker for a string object.

   op must be a unicode object.  The state bits (kind / compact / ascii /
   ready) are checked against the data, utf8 and wstr pointers and their
   lengths.  When check_content is non-zero and the string is not in the
   wchar_t-only state, all characters are scanned as well to verify that the
   narrowest possible kind is in use.

   Every violated invariant trips an assert(); if all hold the function
   returns 1, which allows calls to be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    /* the kind whose element size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that only has a wstr representation */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact string with a separately stored data block */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an unset pointer implies a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *payload = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 c = PyUnicode_READ(kind, payload, pos);
            if (maxchar < c)
                maxchar = c;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100500 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100501}
502
#if defined(HAVE_MBCS)
/* OSVERSIONINFOEX record, only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
512
513/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in the
   lvalue mask.
   BLOOM(mask, ch): non-zero if that bit is set in mask, i.e. ch *may* be a
   member; zero means ch is definitely not a member.
   The mask argument is parenthesized so that compound expressions such as
   "a | b" are combined before the bit test. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True if ch is a line break: table lookup for ASCII, otherwise the bloom
   mask as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
/* True if chr occurs in str: the bloom mask rejects most non-members
   cheaply, and only on a possible hit is the string actually searched with
   PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && !(PyUnicode_FindChar(str, chr, 0,                               \
                             PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
658 PyErr_NoMemory();
659 return NULL;
660 }
661 new_size = (struct_size + (length + 1) * char_size);
662
Victor Stinner84def372011-12-11 20:04:56 +0100663 _Py_DEC_REFTOTAL;
664 _Py_ForgetReference(unicode);
665
666 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
667 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100668 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyErr_NoMemory();
670 return NULL;
671 }
Victor Stinner84def372011-12-11 20:04:56 +0100672 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100674
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200676 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100678 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 _PyUnicode_WSTR_LENGTH(unicode) = length;
680 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
682 length, 0);
683 return unicode;
684}
685
Alexander Belopolsky40018472011-02-26 01:02:56 +0000686static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688{
Victor Stinner95663112011-10-04 01:03:50 +0200689 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100690 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 if (PyUnicode_IS_READY(unicode)) {
695 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200696 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 void *data;
698
699 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200700 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200701 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
702 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703
704 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
705 PyErr_NoMemory();
706 return -1;
707 }
708 new_size = (length + 1) * char_size;
709
Victor Stinner7a9105a2011-12-12 00:13:42 +0100710 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
711 {
712 PyObject_DEL(_PyUnicode_UTF8(unicode));
713 _PyUnicode_UTF8(unicode) = NULL;
714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
715 }
716
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 data = (PyObject *)PyObject_REALLOC(data, new_size);
718 if (data == NULL) {
719 PyErr_NoMemory();
720 return -1;
721 }
722 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200723 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 _PyUnicode_WSTR_LENGTH(unicode) = length;
726 }
727 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 _PyUnicode_UTF8_LENGTH(unicode) = length;
730 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 _PyUnicode_LENGTH(unicode) = length;
732 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200733 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200734 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinner95663112011-10-04 01:03:50 +0200738 assert(_PyUnicode_WSTR(unicode) != NULL);
739
740 /* check for integer overflow */
741 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
742 PyErr_NoMemory();
743 return -1;
744 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200746 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100747 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200748 if (!wstr) {
749 PyErr_NoMemory();
750 return -1;
751 }
752 _PyUnicode_WSTR(unicode) = wstr;
753 _PyUnicode_WSTR(unicode)[length] = 0;
754 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200755 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 return 0;
757}
758
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759static PyObject*
760resize_copy(PyObject *unicode, Py_ssize_t length)
761{
762 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100763 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100765
766 if (PyUnicode_READY(unicode) < 0)
767 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
770 if (copy == NULL)
771 return NULL;
772
773 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200774 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200776 }
777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200778 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 if (w == NULL)
782 return NULL;
783 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
784 copy_length = Py_MIN(copy_length, length);
785 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
786 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200787 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 }
789}
790
/* We allocate one more character (not just one more byte) to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200801static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802#endif
803
Alexander Belopolsky40018472011-02-26 01:02:56 +0000804static PyUnicodeObject *
805_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806{
807 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Thomas Wouters477c8d52006-05-27 19:21:47 +0000810 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (length == 0 && unicode_empty != NULL) {
812 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200813 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000816 /* Ensure we won't overflow the size. */
817 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
818 return (PyUnicodeObject *)PyErr_NoMemory();
819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 if (length < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to _PyUnicode_New");
823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826#ifdef Py_DEBUG
827 ++unicode_old_new_calls;
828#endif
829
830 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
831 if (unicode == NULL)
832 return NULL;
833 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
834 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
835 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100838 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
863}
864
Victor Stinnerf42dc442011-10-02 23:33:16 +0200865static const char*
866unicode_kind_name(PyObject *unicode)
867{
Victor Stinner42dfd712011-10-03 14:41:45 +0200868 /* don't check consistency: unicode_kind_name() is called from
869 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870 if (!PyUnicode_IS_COMPACT(unicode))
871 {
872 if (!PyUnicode_IS_READY(unicode))
873 return "wstr";
874 switch(PyUnicode_KIND(unicode))
875 {
876 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200878 return "legacy ascii";
879 else
880 return "legacy latin1";
881 case PyUnicode_2BYTE_KIND:
882 return "legacy UCS2";
883 case PyUnicode_4BYTE_KIND:
884 return "legacy UCS4";
885 default:
886 return "<legacy invalid kind>";
887 }
888 }
889 assert(PyUnicode_IS_READY(unicode));
890 switch(PyUnicode_KIND(unicode))
891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string or a cached one-character Latin-1 string), else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be
       an entry of the unicode_latin1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
1794 switch(kind) {
1795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 switch(kind) {
1896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
1962 if (PyUnicode_READY(unicode))
1963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
1988 if (PyUnicode_READY(s))
1989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
1997 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w` of `size` elements.
   A size of -1 means "use wcslen(w)".  A NULL buffer is accepted only
   together with size 0 (yielding the empty string); otherwise an
   internal-call error is raised. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
/* Helper for PyUnicode_FromFormatV(): parse one "%..." specification.

   `f` must point at the '%'.  The optional decimal width, the optional
   ".precision" and an optional length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z") are consumed.  A length modifier is
   only recognised when it is directly followed by 'd', 'u' or 'i';
   otherwise it is left in place for the caller to treat as an unknown code.

   Each out-parameter may be NULL; the non-NULL ones receive the parsed
   value (0 when the item is absent).

   Returns a pointer to the conversion character.  For truncated or bogus
   specifications the pointer is moved one character back so that it never
   rests on the terminating '\0'. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* width.precision part, e.g. "%2.5s" => width=2, precision=5
       (ASCII digits only, independent of the locale) */
    for (; '0' <= *f && *f <= '9'; f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; '0' <= *f && *f <= '9'; f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* a '%' right after the precision: step back one character */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format such as "%.1": step back onto the last character */
        f--;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2228
/* Upper bound on the number of characters produced by a %ld conversion:
   21 characters cover a 64-bit integer written in decimal together with an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002474 if (PyUnicode_READY(str_obj)) {
2475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the str and switch to the next slot */
2497 *callresult++ = str;
2498 break;
2499 }
2500 case 'R':
2501 {
2502 PyObject *obj = va_arg(count, PyObject *);
2503 PyObject *repr;
2504 assert(obj);
2505 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002509 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* Remember the repr and switch to the next slot */
2512 *callresult++ = repr;
2513 break;
2514 }
2515 case 'A':
2516 {
2517 PyObject *obj = va_arg(count, PyObject *);
2518 PyObject *ascii;
2519 assert(obj);
2520 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002524 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 /* Remember the repr and switch to the next slot */
2527 *callresult++ = ascii;
2528 break;
2529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 default:
2531 /* if we stumble upon an unknown
2532 formatting code, copy the rest of
2533 the format string to the output
2534 string. (we cannot just skip the
2535 code, since there's no way to know
2536 what's in the argument list) */
2537 n += strlen(p);
2538 goto expand;
2539 }
2540 } else
2541 n++;
2542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 we don't have to resize the string.
2547 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002548 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 if (!string)
2550 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 kind = PyUnicode_KIND(string);
2552 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002558 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002559
2560 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2562 /* checking for == because the last argument could be a empty
2563 string, which causes i to point to end, the assert at the end of
2564 the loop */
2565 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 switch (*f) {
2568 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002569 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 const int ordinal = va_arg(vargs, int);
2571 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002573 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002574 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 case 'p':
2579 /* unused, since we already have the result */
2580 if (*f == 'p')
2581 (void) va_arg(vargs, void *);
2582 else
2583 (void) va_arg(vargs, int);
2584 /* extract the result from numberresults and append. */
2585 for (; *numberresult; ++i, ++numberresult)
2586 PyUnicode_WRITE(kind, data, i, *numberresult);
2587 /* skip over the separating '\0' */
2588 assert(*numberresult == '\0');
2589 numberresult++;
2590 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 case 's':
2593 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002594 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 size = PyUnicode_GET_LENGTH(*callresult);
2598 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002599 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 /* We're done with the unicode()/repr() => forget it */
2602 Py_DECREF(*callresult);
2603 /* switch to next unicode()/repr() result */
2604 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'U':
2608 {
2609 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 Py_ssize_t size;
2611 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2612 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 break;
2616 }
2617 case 'V':
2618 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 size = PyUnicode_GET_LENGTH(obj);
2624 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(*callresult);
2629 assert(PyUnicode_KIND(*callresult) <=
2630 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002631 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 break;
2637 }
2638 case 'S':
2639 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002640 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 /* unused, since we already have the result */
2644 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002646 copy_characters(string, i, *callresult, 0, size);
2647 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* We're done with the unicode()/repr() => forget it */
2649 Py_DECREF(*callresult);
2650 /* switch to next unicode()/repr() result */
2651 ++callresult;
2652 break;
2653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 break;
2657 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 for (; *p; ++p, ++i)
2659 PyUnicode_WRITE(kind, data, i, *p);
2660 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 goto end;
2662 }
Victor Stinner1205f272010-09-11 00:54:47 +00002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 else {
2665 assert(i < PyUnicode_GET_LENGTH(string));
2666 PyUnicode_WRITE(kind, data, i++, *f);
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 if (callresults)
2673 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 if (numberresults)
2675 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002676 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 if (callresults) {
2679 PyObject **callresult2 = callresults;
2680 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002681 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 ++callresult2;
2683 }
2684 PyObject_Free(callresults);
2685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689}
2690
Walter Dörwaldd2034312007-05-18 16:29:38 +00002691PyObject *
2692PyUnicode_FromFormat(const char *format, ...)
2693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 PyObject* ret;
2695 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696
2697#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 ret = PyUnicode_FromFormatV(format, vargs);
2703 va_end(vargs);
2704 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707#ifdef HAVE_WCHAR_H
2708
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2710 convert a Unicode object to a wide character string.
2711
Victor Stinnerd88d9832011-09-06 02:00:05 +02002712 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 character) required to convert the unicode object. Ignore size argument.
2714
Victor Stinnerd88d9832011-09-06 02:00:05 +02002715 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
2722{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 const wchar_t *wstr;
2725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002726 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 if (wstr == NULL)
2728 return -1;
2729
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size > res)
2732 size = res + 1;
2733 else
2734 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 return res;
2737 }
2738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002740}
2741
2742Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002743PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002744 wchar_t *w,
2745 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
2747 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyErr_BadInternalCall();
2749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752}
2753
Victor Stinner137c34c2010-09-29 10:25:54 +00002754wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002755PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 Py_ssize_t *size)
2757{
2758 wchar_t* buffer;
2759 Py_ssize_t buflen;
2760
2761 if (unicode == NULL) {
2762 PyErr_BadInternalCall();
2763 return NULL;
2764 }
2765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002766 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (buflen == -1)
2768 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 PyErr_NoMemory();
2771 return NULL;
2772 }
2773
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2775 if (buffer == NULL) {
2776 PyErr_NoMemory();
2777 return NULL;
2778 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002779 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (buflen == -1)
2781 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size != NULL)
2783 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 return buffer;
2785}
2786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Alexander Belopolsky40018472011-02-26 01:02:56 +00002789PyObject *
2790PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002793 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_ValueError,
2795 "chr() arg not in range(0x110000)");
2796 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 if (ordinal < 256)
2800 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 v = PyUnicode_New(1, ordinal);
2803 if (v == NULL)
2804 return NULL;
2805 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002806 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002815 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002816 if (PyUnicode_READY(obj))
2817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 Py_INCREF(obj);
2819 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 }
2821 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 /* For a Unicode subtype that's not a Unicode object,
2823 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002824 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002826 PyErr_Format(PyExc_TypeError,
2827 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002828 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830}
2831
Alexander Belopolsky40018472011-02-26 01:02:56 +00002832PyObject *
2833PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002834 const char *encoding,
2835 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002837 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 PyErr_BadInternalCall();
2842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 /* Decoding bytes objects is the most common case and should be fast */
2846 if (PyBytes_Check(obj)) {
2847 if (PyBytes_GET_SIZE(obj) == 0) {
2848 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002849 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 }
2851 else {
2852 v = PyUnicode_Decode(
2853 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2854 encoding, errors);
2855 }
2856 return v;
2857 }
2858
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyErr_SetString(PyExc_TypeError,
2861 "decoding str is not supported");
2862 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002865 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2866 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2867 PyErr_Format(PyExc_TypeError,
2868 "coercing to str: need bytes, bytearray "
2869 "or buffer-like object, %.80s found",
2870 Py_TYPE(obj)->tp_name);
2871 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002872 }
Tim Petersced69f82003-09-16 20:30:58 +00002873
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002876 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the NUL-terminated result; lower_len is its capacity in
   bytes, terminator included.  Return 0 on error (the result does not fit,
   i.e. encoding is longer than lower_len-1), 1 on success.  On error the
   content of lower is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for the terminator nothing can be written; this also
       keeps lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the terminating NUL as well. */
        if (sizeof(default_encoding) > lower_len)
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    l_end = lower + (lower_len - 1);
    for (e = encoding, l = lower; *e != '\0'; e++, l++) {
        char c = *e;

        if (l == l_end)
            return 0;
        /* Locale-independent ASCII folding: encoding names are ASCII and
           must not depend on the current locale's notion of case. */
        if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        else if (c == '_')
            c = '-';
        *l = c;
    }
    *l = '\0';
    return 1;
}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 Py_ssize_t size,
2925 const char *encoding,
2926 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002927{
2928 PyObject *buffer = NULL, *unicode;
2929 Py_buffer info;
2930 char lower[11]; /* Enough for any encoding shortcut */
2931
Fred Drakee4315f52000-05-09 19:53:39 +00002932 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002933 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002934 if ((strcmp(lower, "utf-8") == 0) ||
2935 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002936 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002937 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002938 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002939 (strcmp(lower, "iso-8859-1") == 0))
2940 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002941#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002942 else if (strcmp(lower, "mbcs") == 0)
2943 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002944#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002945 else if (strcmp(lower, "ascii") == 0)
2946 return PyUnicode_DecodeASCII(s, size, errors);
2947 else if (strcmp(lower, "utf-16") == 0)
2948 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2949 else if (strcmp(lower, "utf-32") == 0)
2950 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952
2953 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002954 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002955 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002956 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002957 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 if (buffer == NULL)
2959 goto onError;
2960 unicode = PyCodec_Decode(buffer, encoding, errors);
2961 if (unicode == NULL)
2962 goto onError;
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002965 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002966 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 Py_DECREF(unicode);
2968 goto onError;
2969 }
2970 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002971 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002972
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 Py_XDECREF(buffer);
2975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 const char *encoding,
2981 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002982{
2983 PyObject *v;
2984
2985 if (!PyUnicode_Check(unicode)) {
2986 PyErr_BadArgument();
2987 goto onError;
2988 }
2989
2990 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992
2993 /* Decode via the codec registry */
2994 v = PyCodec_Decode(unicode, encoding, errors);
2995 if (v == NULL)
2996 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002997 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000 return NULL;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017
3018 /* Decode via the codec registry */
3019 v = PyCodec_Decode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 if (!PyUnicode_Check(v)) {
3023 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003024 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025 Py_TYPE(v)->tp_name);
3026 Py_DECREF(v);
3027 goto onError;
3028 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003029 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003032 return NULL;
3033}
3034
Alexander Belopolsky40018472011-02-26 01:02:56 +00003035PyObject *
3036PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003037 Py_ssize_t size,
3038 const char *encoding,
3039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
3041 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003042
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 unicode = PyUnicode_FromUnicode(s, size);
3044 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3047 Py_DECREF(unicode);
3048 return v;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003055{
3056 PyObject *v;
3057
3058 if (!PyUnicode_Check(unicode)) {
3059 PyErr_BadArgument();
3060 goto onError;
3061 }
3062
3063 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065
3066 /* Encode via the codec registry */
3067 v = PyCodec_Encode(unicode, encoding, errors);
3068 if (v == NULL)
3069 goto onError;
3070 return v;
3071
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073 return NULL;
3074}
3075
/* Locate the first wide character of wstr that wcstombs() fails to
   convert.

   The string is converted one character at a time (when wchar_t is 16 bits
   wide, a high surrogate followed by a low surrogate is converted as one
   unit) until wcstombs() reports an error.  Return the index, in wchar_t
   units, of the character that failed.  Return 0 if every character could
   be converted; this cannot be told apart from a failure on the very first
   character.

   errno is restored to the value it had on entry. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    size_t len;
#if SIZEOF_WCHAR_T == 2
    wchar_t buf[3];
#else
    wchar_t buf[2];
#endif
    char outbuf[MB_LEN_MAX];
    const wchar_t *start, *previous;
    int save_errno;

    save_errno = errno;
    /* The last slot always holds the terminator of the scratch string. */
#if SIZEOF_WCHAR_T == 2
    buf[2] = 0;
#else
    buf[1] = 0;
#endif
    start = wstr;
    while (*wstr != L'\0')
    {
        previous = wstr;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            buf[0] = wstr[0];
            buf[1] = wstr[1];
            wstr += 2;
        }
        else {
            buf[0] = *wstr;
            buf[1] = 0;
            wstr++;
        }
#else
        buf[0] = *wstr;
        wstr++;
#endif
        len = wcstombs(outbuf, buf, sizeof(outbuf));
        if (len == (size_t)-1) {
            errno = save_errno;
            return (size_t)(previous - start);
        }
    }

    /* failed to find the unencodable character */
    errno = save_errno;
    return 0;
}
3127
3128PyObject *
3129PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
3130{
3131 Py_ssize_t wlen, wlen2;
3132 wchar_t *wstr;
3133 PyObject *bytes = NULL;
3134 char *errmsg;
3135 PyObject *exc;
3136 size_t error_pos;
3137
3138 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3139 if (wstr == NULL)
3140 return NULL;
3141
3142 wlen2 = wcslen(wstr);
3143 if (wlen2 != wlen) {
3144 PyMem_Free(wstr);
3145 PyErr_SetString(PyExc_TypeError, "embedded null character");
3146 return NULL;
3147 }
3148
3149 if (surrogateescape) {
3150 /* locale encoding with surrogateescape */
3151 char *str;
3152
3153 str = _Py_wchar2char(wstr, &error_pos);
3154 if (str == NULL) {
3155 if (error_pos == (size_t)-1) {
3156 PyErr_NoMemory();
3157 PyMem_Free(wstr);
3158 return NULL;
3159 }
3160 else {
3161 goto encode_error;
3162 }
3163 }
3164 PyMem_Free(wstr);
3165
3166 bytes = PyBytes_FromString(str);
3167 PyMem_Free(str);
3168 }
3169 else {
3170 size_t len, len2;
3171
3172 len = wcstombs(NULL, wstr, 0);
3173 if (len == (size_t)-1) {
3174 error_pos = wcstombs_errorpos(wstr);
3175 goto encode_error;
3176 }
3177
3178 bytes = PyBytes_FromStringAndSize(NULL, len);
3179 if (bytes == NULL) {
3180 PyMem_Free(wstr);
3181 return NULL;
3182 }
3183
3184 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3185 if (len2 == (size_t)-1 || len2 > len) {
3186 error_pos = wcstombs_errorpos(wstr);
3187 goto encode_error;
3188 }
3189 PyMem_Free(wstr);
3190 }
3191 return bytes;
3192
3193encode_error:
3194 errmsg = strerror(errno);
3195 assert(errmsg != NULL);
3196 if (errmsg == NULL)
3197 errmsg = "wcstombs() encountered an unencodable wide character";
3198 PyMem_Free(wstr);
3199 Py_XDECREF(bytes);
3200
3201 exc = NULL;
3202 raise_encode_exception(&exc,
3203 "locale", unicode,
3204 error_pos, error_pos+1,
3205 errmsg);
3206 Py_XDECREF(exc);
3207 return NULL;
3208}
3209
Victor Stinnerad158722010-10-27 00:25:46 +00003210PyObject *
3211PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003212{
Victor Stinner99b95382011-07-04 14:23:54 +02003213#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003214 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003215#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003216 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003217#else
Victor Stinner793b5312011-04-27 00:24:21 +02003218 PyInterpreterState *interp = PyThreadState_GET()->interp;
3219 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3220 cannot use it to encode and decode filenames before it is loaded. Load
3221 the Python codec requires to encode at least its own filename. Use the C
3222 version of the locale codec until the codec registry is initialized and
3223 the Python codec is loaded.
3224
3225 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3226 cannot only rely on it: check also interp->fscodec_initialized for
3227 subinterpreters. */
3228 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003229 return PyUnicode_AsEncodedString(unicode,
3230 Py_FileSystemDefaultEncoding,
3231 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003232 }
3233 else {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 return PyUnicode_EncodeLocale(unicode, 1);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003235 }
Victor Stinnerad158722010-10-27 00:25:46 +00003236#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003237}
3238
Alexander Belopolsky40018472011-02-26 01:02:56 +00003239PyObject *
3240PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003241 const char *encoding,
3242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243{
3244 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003245 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003246
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 if (!PyUnicode_Check(unicode)) {
3248 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 }
Fred Drakee4315f52000-05-09 19:53:39 +00003251
Fred Drakee4315f52000-05-09 19:53:39 +00003252 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003253 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003254 if ((strcmp(lower, "utf-8") == 0) ||
3255 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003256 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003257 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003258 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003259 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003260 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003261 }
Victor Stinner37296e82010-06-10 13:36:23 +00003262 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003263 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003264 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003265 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003266#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003267 else if (strcmp(lower, "mbcs") == 0)
3268 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003269#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003270 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003271 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273
3274 /* Encode via the codec registry */
3275 v = PyCodec_Encode(unicode, encoding, errors);
3276 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003277 return NULL;
3278
3279 /* The normal path */
3280 if (PyBytes_Check(v))
3281 return v;
3282
3283 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003284 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003285 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003286 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003287
3288 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3289 "encoder %s returned bytearray instead of bytes",
3290 encoding);
3291 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003292 Py_DECREF(v);
3293 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003294 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003295
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003296 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3297 Py_DECREF(v);
3298 return b;
3299 }
3300
3301 PyErr_Format(PyExc_TypeError,
3302 "encoder did not return a bytes object (type=%.400s)",
3303 Py_TYPE(v)->tp_name);
3304 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003305 return NULL;
3306}
3307
Alexander Belopolsky40018472011-02-26 01:02:56 +00003308PyObject *
3309PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003310 const char *encoding,
3311 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003312{
3313 PyObject *v;
3314
3315 if (!PyUnicode_Check(unicode)) {
3316 PyErr_BadArgument();
3317 goto onError;
3318 }
3319
3320 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003322
3323 /* Encode via the codec registry */
3324 v = PyCodec_Encode(unicode, encoding, errors);
3325 if (v == NULL)
3326 goto onError;
3327 if (!PyUnicode_Check(v)) {
3328 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003329 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003330 Py_TYPE(v)->tp_name);
3331 Py_DECREF(v);
3332 goto onError;
3333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003335
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 return NULL;
3338}
3339
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003340PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003341PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3342 int surrogateescape)
3343{
3344 wchar_t smallbuf[256];
3345 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3346 wchar_t *wstr;
3347 size_t wlen, wlen2;
3348 PyObject *unicode;
3349
3350 if (str[len] != '\0' || len != strlen(str)) {
3351 PyErr_SetString(PyExc_TypeError, "embedded null character");
3352 return NULL;
3353 }
3354
3355 if (surrogateescape)
3356 {
3357 wstr = _Py_char2wchar(str, &wlen);
3358 if (wstr == NULL) {
3359 if (wlen == (size_t)-1)
3360 PyErr_NoMemory();
3361 else
3362 PyErr_SetFromErrno(PyExc_OSError);
3363 return NULL;
3364 }
3365
3366 unicode = PyUnicode_FromWideChar(wstr, wlen);
3367 PyMem_Free(wstr);
3368 }
3369 else {
3370#ifndef HAVE_BROKEN_MBSTOWCS
3371 wlen = mbstowcs(NULL, str, 0);
3372#else
3373 wlen = len;
3374#endif
3375 if (wlen == (size_t)-1) {
3376 PyErr_SetFromErrno(PyExc_OSError);
3377 return NULL;
3378 }
3379 if (wlen+1 <= smallbuf_len) {
3380 wstr = smallbuf;
3381 }
3382 else {
3383 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3384 return PyErr_NoMemory();
3385
3386 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3387 if (!wstr)
3388 return PyErr_NoMemory();
3389 }
3390
3391 /* This shouldn't fail now */
3392 wlen2 = mbstowcs(wstr, str, wlen+1);
3393 if (wlen2 == (size_t)-1) {
3394 if (wstr != smallbuf)
3395 PyMem_Free(wstr);
3396 PyErr_SetFromErrno(PyExc_OSError);
3397 return NULL;
3398 }
3399#ifdef HAVE_BROKEN_MBSTOWCS
3400 assert(wlen2 == wlen);
3401#endif
3402 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3403 if (wstr != smallbuf)
3404 PyMem_Free(wstr);
3405 }
3406 return unicode;
3407}
3408
3409PyObject*
3410PyUnicode_DecodeLocale(const char *str, int surrogateescape)
3411{
3412 Py_ssize_t size = (Py_ssize_t)strlen(str);
3413 return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
3414}
3415
3416
3417PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003418PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003419 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003420 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3421}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003422
Christian Heimes5894ba72007-11-04 11:43:14 +00003423PyObject*
3424PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3425{
Victor Stinner99b95382011-07-04 14:23:54 +02003426#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003427 return PyUnicode_DecodeMBCS(s, size, NULL);
3428#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003429 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003430#else
Victor Stinner793b5312011-04-27 00:24:21 +02003431 PyInterpreterState *interp = PyThreadState_GET()->interp;
3432 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3433 cannot use it to encode and decode filenames before it is loaded. Load
3434 the Python codec requires to encode at least its own filename. Use the C
3435 version of the locale codec until the codec registry is initialized and
3436 the Python codec is loaded.
3437
3438 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3439 cannot only rely on it: check also interp->fscodec_initialized for
3440 subinterpreters. */
3441 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003442 return PyUnicode_Decode(s, size,
3443 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003444 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003445 }
3446 else {
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003447 return PyUnicode_DecodeLocaleAndSize(s, size, 1);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003448 }
Victor Stinnerad158722010-10-27 00:25:46 +00003449#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003450}
3451
Martin v. Löwis011e8422009-05-05 04:43:17 +00003452
3453int
3454PyUnicode_FSConverter(PyObject* arg, void* addr)
3455{
3456 PyObject *output = NULL;
3457 Py_ssize_t size;
3458 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003459 if (arg == NULL) {
3460 Py_DECREF(*(PyObject**)addr);
3461 return 1;
3462 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003463 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003464 output = arg;
3465 Py_INCREF(output);
3466 }
3467 else {
3468 arg = PyUnicode_FromObject(arg);
3469 if (!arg)
3470 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003471 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003472 Py_DECREF(arg);
3473 if (!output)
3474 return 0;
3475 if (!PyBytes_Check(output)) {
3476 Py_DECREF(output);
3477 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3478 return 0;
3479 }
3480 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003481 size = PyBytes_GET_SIZE(output);
3482 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003483 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003484 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003485 Py_DECREF(output);
3486 return 0;
3487 }
3488 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003489 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003490}
3491
3492
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003493int
3494PyUnicode_FSDecoder(PyObject* arg, void* addr)
3495{
3496 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003497 if (arg == NULL) {
3498 Py_DECREF(*(PyObject**)addr);
3499 return 1;
3500 }
3501 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003502 if (PyUnicode_READY(arg))
3503 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003504 output = arg;
3505 Py_INCREF(output);
3506 }
3507 else {
3508 arg = PyBytes_FromObject(arg);
3509 if (!arg)
3510 return 0;
3511 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3512 PyBytes_GET_SIZE(arg));
3513 Py_DECREF(arg);
3514 if (!output)
3515 return 0;
3516 if (!PyUnicode_Check(output)) {
3517 Py_DECREF(output);
3518 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3519 return 0;
3520 }
3521 }
Victor Stinner065836e2011-10-27 01:56:33 +02003522 if (PyUnicode_READY(output) < 0) {
3523 Py_DECREF(output);
3524 return 0;
3525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003527 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003528 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3529 Py_DECREF(output);
3530 return 0;
3531 }
3532 *(PyObject**)addr = output;
3533 return Py_CLEANUP_SUPPORTED;
3534}
3535
3536
Martin v. Löwis5b222132007-06-10 09:51:05 +00003537char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003538PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003539{
Christian Heimesf3863112007-11-22 07:46:41 +00003540 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003541
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003542 if (!PyUnicode_Check(unicode)) {
3543 PyErr_BadArgument();
3544 return NULL;
3545 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003546 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003547 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003548
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003549 if (PyUnicode_UTF8(unicode) == NULL) {
3550 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3552 if (bytes == NULL)
3553 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003554 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3555 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003556 Py_DECREF(bytes);
3557 return NULL;
3558 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003559 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3560 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3561 PyBytes_AS_STRING(bytes),
3562 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 Py_DECREF(bytes);
3564 }
3565
3566 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003567 *psize = PyUnicode_UTF8_LENGTH(unicode);
3568 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003569}
3570
3571char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003572PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003573{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003574 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3575}
3576
#ifdef Py_DEBUG
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to create the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3580
3581
3582Py_UNICODE *
3583PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 const unsigned char *one_byte;
3586#if SIZEOF_WCHAR_T == 4
3587 const Py_UCS2 *two_bytes;
3588#else
3589 const Py_UCS4 *four_bytes;
3590 const Py_UCS4 *ucs4_end;
3591 Py_ssize_t num_surrogates;
3592#endif
3593 wchar_t *w;
3594 wchar_t *wchar_end;
3595
3596 if (!PyUnicode_Check(unicode)) {
3597 PyErr_BadArgument();
3598 return NULL;
3599 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003600 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003601 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003602 assert(_PyUnicode_KIND(unicode) != 0);
3603 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604
3605#ifdef Py_DEBUG
3606 ++unicode_as_unicode_calls;
3607#endif
3608
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003609 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003610#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003611 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3612 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003613 num_surrogates = 0;
3614
3615 for (; four_bytes < ucs4_end; ++four_bytes) {
3616 if (*four_bytes > 0xFFFF)
3617 ++num_surrogates;
3618 }
3619
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003620 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3621 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3622 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003623 PyErr_NoMemory();
3624 return NULL;
3625 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003626 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003627
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003628 w = _PyUnicode_WSTR(unicode);
3629 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3630 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003631 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3632 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003633 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003634 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003635 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3636 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003637 }
3638 else
3639 *w = *four_bytes;
3640
3641 if (w > wchar_end) {
3642 assert(0 && "Miscalculated string end");
3643 }
3644 }
3645 *w = 0;
3646#else
3647 /* sizeof(wchar_t) == 4 */
3648 Py_FatalError("Impossible unicode object state, wstr and str "
3649 "should share memory already.");
3650 return NULL;
3651#endif
3652 }
3653 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003654 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3655 (_PyUnicode_LENGTH(unicode) + 1));
3656 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003657 PyErr_NoMemory();
3658 return NULL;
3659 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003660 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3661 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3662 w = _PyUnicode_WSTR(unicode);
3663 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003664
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003665 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3666 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003667 for (; w < wchar_end; ++one_byte, ++w)
3668 *w = *one_byte;
3669 /* null-terminate the wstr */
3670 *w = 0;
3671 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003672 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003673#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003674 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675 for (; w < wchar_end; ++two_bytes, ++w)
3676 *w = *two_bytes;
3677 /* null-terminate the wstr */
3678 *w = 0;
3679#else
3680 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003681 PyObject_FREE(_PyUnicode_WSTR(unicode));
3682 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 Py_FatalError("Impossible unicode object state, wstr "
3684 "and str should share memory already.");
3685 return NULL;
3686#endif
3687 }
3688 else {
3689 assert(0 && "This should never happen.");
3690 }
3691 }
3692 }
3693 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003694 *size = PyUnicode_WSTR_LENGTH(unicode);
3695 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003696}
3697
Alexander Belopolsky40018472011-02-26 01:02:56 +00003698Py_UNICODE *
3699PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003701 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702}
3703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704
Alexander Belopolsky40018472011-02-26 01:02:56 +00003705Py_ssize_t
3706PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707{
3708 if (!PyUnicode_Check(unicode)) {
3709 PyErr_BadArgument();
3710 goto onError;
3711 }
3712 return PyUnicode_GET_SIZE(unicode);
3713
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 return -1;
3716}
3717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718Py_ssize_t
3719PyUnicode_GetLength(PyObject *unicode)
3720{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003721 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 PyErr_BadArgument();
3723 return -1;
3724 }
3725
3726 return PyUnicode_GET_LENGTH(unicode);
3727}
3728
3729Py_UCS4
3730PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3731{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003732 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3733 PyErr_BadArgument();
3734 return (Py_UCS4)-1;
3735 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003736 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003737 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 return (Py_UCS4)-1;
3739 }
3740 return PyUnicode_READ_CHAR(unicode, index);
3741}
3742
3743int
3744PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3745{
3746 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003747 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003748 return -1;
3749 }
Victor Stinner488fa492011-12-12 00:01:39 +01003750 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003751 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003752 PyErr_SetString(PyExc_IndexError, "string index out of range");
3753 return -1;
3754 }
Victor Stinner488fa492011-12-12 00:01:39 +01003755 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003756 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3758 index, ch);
3759 return 0;
3760}
3761
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3767
Victor Stinner554f3f02010-06-16 23:33:54 +00003768/* create or adjust a UnicodeDecodeError */
3769static void
3770make_decode_exception(PyObject **exceptionObject,
3771 const char *encoding,
3772 const char *input, Py_ssize_t length,
3773 Py_ssize_t startpos, Py_ssize_t endpos,
3774 const char *reason)
3775{
3776 if (*exceptionObject == NULL) {
3777 *exceptionObject = PyUnicodeDecodeError_Create(
3778 encoding, input, length, startpos, endpos, reason);
3779 }
3780 else {
3781 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3782 goto onError;
3783 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3784 goto onError;
3785 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3786 goto onError;
3787 }
3788 return;
3789
3790onError:
3791 Py_DECREF(*exceptionObject);
3792 *exceptionObject = NULL;
3793}
3794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003795/* error handling callback helper:
3796 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003797 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 and adjust various state variables.
3799 return 0 on success, -1 on error
3800*/
3801
Alexander Belopolsky40018472011-02-26 01:02:56 +00003802static int
3803unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003804 const char *encoding, const char *reason,
3805 const char **input, const char **inend, Py_ssize_t *startinpos,
3806 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003807 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003809 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810
3811 PyObject *restuple = NULL;
3812 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003813 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003814 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003815 Py_ssize_t requiredsize;
3816 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003817 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 int res = -1;
3819
Victor Stinner596a6c42011-11-09 00:02:18 +01003820 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3821 outsize = PyUnicode_GET_LENGTH(*output);
3822 else
3823 outsize = _PyUnicode_WSTR_LENGTH(*output);
3824
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003825 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 *errorHandler = PyCodec_LookupError(errors);
3827 if (*errorHandler == NULL)
3828 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003829 }
3830
Victor Stinner554f3f02010-06-16 23:33:54 +00003831 make_decode_exception(exceptionObject,
3832 encoding,
3833 *input, *inend - *input,
3834 *startinpos, *endinpos,
3835 reason);
3836 if (*exceptionObject == NULL)
3837 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838
3839 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3840 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003841 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003842 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003843 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 }
3846 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003847 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003848 if (PyUnicode_READY(repunicode) < 0)
3849 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003850
3851 /* Copy back the bytes variables, which might have been modified by the
3852 callback */
3853 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3854 if (!inputobj)
3855 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003856 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003857 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003858 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003859 *input = PyBytes_AS_STRING(inputobj);
3860 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003861 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003862 /* we can DECREF safely, as the exception has another reference,
3863 so the object won't go away. */
3864 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003868 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3870 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003871 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872
Victor Stinner596a6c42011-11-09 00:02:18 +01003873 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3874 /* need more space? (at least enough for what we
3875 have+the replacement+the rest of the string (starting
3876 at the new input position), so we won't have to check space
3877 when there are no errors in the rest of the string) */
3878 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3879 requiredsize = *outpos + replen + insize-newpos;
3880 if (requiredsize > outsize) {
3881 if (requiredsize<2*outsize)
3882 requiredsize = 2*outsize;
3883 if (unicode_resize(output, requiredsize) < 0)
3884 goto onError;
3885 }
3886 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003887 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003888 copy_characters(*output, *outpos, repunicode, 0, replen);
3889 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003891 else {
3892 wchar_t *repwstr;
3893 Py_ssize_t repwlen;
3894 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3895 if (repwstr == NULL)
3896 goto onError;
3897 /* need more space? (at least enough for what we
3898 have+the replacement+the rest of the string (starting
3899 at the new input position), so we won't have to check space
3900 when there are no errors in the rest of the string) */
3901 requiredsize = *outpos + repwlen + insize-newpos;
3902 if (requiredsize > outsize) {
3903 if (requiredsize < 2*outsize)
3904 requiredsize = 2*outsize;
3905 if (unicode_resize(output, requiredsize) < 0)
3906 goto onError;
3907 }
3908 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3909 *outpos += repwlen;
3910 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003911 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003912 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 /* we made it! */
3915 res = 0;
3916
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 Py_XDECREF(restuple);
3919 return res;
3920}
3921
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003922/* --- UTF-7 Codec -------------------------------------------------------- */
3923
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924/* See RFC2152 for details. We encode conservatively and decode liberally. */
3925
3926/* Three simple macros defining base-64. */
3927
3928/* Is c a base-64 character? */
3929
/* NOTE: c is expanded more than once; pass an expression without side
   effects. */
3930#define IS_BASE64(c) \
3931 (((c) >= 'A' && (c) <= 'Z') || \
3932 ((c) >= 'a' && (c) <= 'z') || \
3933 ((c) >= '0' && (c) <= '9') || \
3934 (c) == '+' || (c) == '/')
3935
3936/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
   anything else to 63, so the result is only meaningful when IS_BASE64(c)
   holds.  c is expanded more than once. */
3937
3938#define FROM_BASE64(c) \
3939 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
3940 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
3941 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
3942 (c) == '+' ? 62 : 63)
3943
3944/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off with 0x3f before the alphabet string is
   indexed, so any integer value of n is safe. */
3945
3946#define TO_BASE64(n) \
3947 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3948
3949/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
3950 * decoded as itself. We are permissive on decoding; the only ASCII
3951 * byte not decoding to itself is the + which begins a base64
3952 * string. */
3953
/* NOTE: c is expanded twice; pass an expression without side effects. */
3954#define DECODE_DIRECT(c) \
3955 ((c) <= 127 && (c) != '+')
3956
3957/* The UTF-7 encoder treats ASCII characters differently according to
3958 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3959 * the above). See RFC2152. This array identifies these different
3960 * sets:
3961 * 0 : "Set D"
3962 * alphanumeric and '(),-./:?
3963 * 1 : "Set O"
3964 * !"#$%&*;<=>@[]^_`{|}
3965 * 2 : "whitespace"
3966 * ht nl cr sp
3967 * 3 : special (must be base64 encoded)
3968 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3969 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003970
/* UTF-7 encoder class of each ASCII code point 0..127:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = must be base64 encoded.
   The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3990
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991/* ENCODE_DIRECT: this character should be encoded as itself. The
3992 * answer depends on whether we are encoding set O as itself, and also
3993 * on whether we are encoding whitespace as itself. RFC2152 makes it
3994 * clear that the answers to these questions vary between
3995 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00003996
/* Non-zero when c (in 1..127) may be written as itself: always for category
   0 of utf7_category, for category 2 only when directWS is true, and for
   category 1 only when directO is true.  The flag arguments are
   parenthesized so that compound expressions such as `a || b` keep their
   meaning inside the macro.  c is expanded more than once; pass an
   expression without side effects. */
#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002
Alexander Belopolsky40018472011-02-26 01:02:56 +00004003PyObject *
4004PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004005 Py_ssize_t size,
4006 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004007{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004008 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4009}
4010
Antoine Pitrou244651a2009-05-04 18:56:13 +00004011/* The decoder. The only state we preserve is our read position,
4012 * i.e. how many characters we have consumed. So if we end in the
4013 * middle of a shift sequence we have to back off the read position
4014 * and the output to the beginning of the sequence, otherwise we lose
4015 * all the shift state (seen bits, number of bits seen, high
4016 * surrogate). */
4017
/* Decode `size` bytes of UTF-7 data at `s` into a new unicode object.

   Malformed input is handed to unicode_decode_call_errorhandler() together
   with `errors`.  When `consumed` is NULL, input that ends inside an
   inconsistent shift sequence is reported as "unterminated shift sequence".
   When `consumed` is non-NULL, an unfinished trailing shift sequence is not
   an error: the output is rolled back to where the sequence started and
   *consumed is set to the offset of its '+'; otherwise *consumed is the
   number of bytes processed.

   Returns the decoded object, or NULL on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004018PyObject *
4019PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004020 Py_ssize_t size,
4021 const char *errors,
4022 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004023{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004024 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004025 Py_ssize_t startinpos;
4026 Py_ssize_t endinpos;
4027 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004029 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030 const char *errmsg = "";
/* Shift state: inShift is non-zero inside a '+...' base-64 section,
   shiftOutStart is the output position at which that section began,
   base64buffer/base64bits hold the bits not yet turned into a character,
   and surrogate holds a high surrogate waiting for its low half. */
4031 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004032 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004033 unsigned int base64bits = 0;
4034 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004035 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004036 PyObject *errorHandler = NULL;
4037 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004038
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004039 /* Start off assuming it's all ASCII. Widen later as necessary. */
4040 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004041 if (!unicode)
4042 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004043 if (size == 0) {
4044 if (consumed)
4045 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004046 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004047 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004048
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004049 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004050 e = s + size;
4051
4052 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004053 Py_UCS4 ch;
/* Also entered by the "goto restart" in the end-of-string handling below,
   when the error handler leaves s before e. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004055 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004056
Antoine Pitrou244651a2009-05-04 18:56:13 +00004057 if (inShift) { /* in a base-64 section */
4058 if (IS_BASE64(ch)) { /* consume a base-64 character */
4059 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4060 base64bits += 6;
4061 s++;
4062 if (base64bits >= 16) {
4063 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004064 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004065 base64bits -= 16;
4066 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4067 if (surrogate) {
4068 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004069 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4070 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004071 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4072 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004073 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004074 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004075 }
4076 else {
/* The pending high surrogate is not followed by a low one: emit it on its
   own and fall through to handle outCh normally. */
Antoine Pitrou78edf752011-11-15 01:44:16 +01004077 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4078 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004080 }
4081 }
Victor Stinner551ac952011-11-29 22:58:13 +01004082 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004083 /* first surrogate */
4084 surrogate = outCh;
4085 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004087 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4088 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004089 }
4090 }
4091 }
4092 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004093 inShift = 0;
4094 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004095 if (surrogate) {
/* A high surrogate still pending when the section ends is emitted
   unpaired. */
Antoine Pitrou78edf752011-11-15 01:44:16 +01004096 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4097 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004098 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004099 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004100 if (base64bits > 0) { /* left-over bits */
4101 if (base64bits >= 6) {
4102 /* We've seen at least one base-64 character */
4103 errmsg = "partial character in shift sequence";
4104 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004105 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004106 else {
4107 /* Some bits remain; they should be zero */
4108 if (base64buffer != 0) {
4109 errmsg = "non-zero padding bits in shift sequence";
4110 goto utf7Error;
4111 }
4112 }
4113 }
4114 if (ch != '-') {
4115 /* '-' is absorbed; other terminating
4116 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004117 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4118 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004119 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004120 }
4121 }
4122 else if ( ch == '+' ) {
/* Remember where the '+' is: used as the error start position and, in
   stateful mode, as the value of *consumed for an unfinished sequence. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004124 s++; /* consume '+' */
4125 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004126 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004127 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4128 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004129 }
4130 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004131 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004132 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004133 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004134 }
4135 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004136 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004137 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4138 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004139 s++;
4140 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141 else {
4142 startinpos = s-starts;
4143 s++;
4144 errmsg = "unexpected special character";
4145 goto utf7Error;
4146 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147 continue;
/* Common error exit of the loop body: the handler may replace the input
   buffer (starts, e) and repositions s before the loop continues. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004148utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 endinpos = s-starts;
4150 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 errors, &errorHandler,
4152 "utf7", errmsg,
4153 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004154 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004155 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004156 }
4157
Antoine Pitrou244651a2009-05-04 18:56:13 +00004158 /* end of string */
4159
4160 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4161 /* if we're in an inconsistent state, that's an error */
4162 if (surrogate ||
4163 (base64bits >= 6) ||
4164 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004165 endinpos = size;
4166 if (unicode_decode_call_errorhandler(
4167 errors, &errorHandler,
4168 "utf7", "unterminated shift sequence",
4169 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004170 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004171 goto onError;
/* The handler may have moved s back into the input: resume decoding. */
4172 if (s < e)
4173 goto restart;
4174 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004175 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004176
4177 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004178 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004179 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004180 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004181 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004182 }
4183 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004184 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004185 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004186 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004187
/* Trim the over-allocated result to the number of characters written. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004188 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004189 goto onError;
4190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 Py_XDECREF(errorHandler);
4192 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004193 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004194
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 Py_XDECREF(errorHandler);
4197 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004198 Py_DECREF(unicode);
4199 return NULL;
4200}
4201
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004204_PyUnicode_EncodeUTF7(PyObject *str,
4205 int base64SetO,
4206 int base64WhiteSpace,
4207 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004208{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004209 int kind;
4210 void *data;
4211 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004212 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004213 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004214 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004215 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004216 unsigned int base64bits = 0;
4217 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004218 char * out;
4219 char * start;
4220
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004221 if (PyUnicode_READY(str) < 0)
4222 return NULL;
4223 kind = PyUnicode_KIND(str);
4224 data = PyUnicode_DATA(str);
4225 len = PyUnicode_GET_LENGTH(str);
4226
4227 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004229
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004230 /* It might be possible to tighten this worst case */
4231 allocated = 8 * len;
4232 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004233 return PyErr_NoMemory();
4234
Antoine Pitrou244651a2009-05-04 18:56:13 +00004235 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004236 if (v == NULL)
4237 return NULL;
4238
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004239 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004240 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004241 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004242
Antoine Pitrou244651a2009-05-04 18:56:13 +00004243 if (inShift) {
4244 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4245 /* shifting out */
4246 if (base64bits) { /* output remaining bits */
4247 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4248 base64buffer = 0;
4249 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004250 }
4251 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252 /* Characters not in the BASE64 set implicitly unshift the sequence
4253 so no '-' is required, except if the character is itself a '-' */
4254 if (IS_BASE64(ch) || ch == '-') {
4255 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004256 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004257 *out++ = (char) ch;
4258 }
4259 else {
4260 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004261 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004262 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 else { /* not in a shift sequence */
4264 if (ch == '+') {
4265 *out++ = '+';
4266 *out++ = '-';
4267 }
4268 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4269 *out++ = (char) ch;
4270 }
4271 else {
4272 *out++ = '+';
4273 inShift = 1;
4274 goto encode_char;
4275 }
4276 }
4277 continue;
4278encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004279 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004280 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004281
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282 /* code first surrogate */
4283 base64bits += 16;
4284 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4285 while (base64bits >= 6) {
4286 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4287 base64bits -= 6;
4288 }
4289 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004290 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292 base64bits += 16;
4293 base64buffer = (base64buffer << 16) | ch;
4294 while (base64bits >= 6) {
4295 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4296 base64bits -= 6;
4297 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004298 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004299 if (base64bits)
4300 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4301 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004303 if (_PyBytes_Resize(&v, out - start) < 0)
4304 return NULL;
4305 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004307PyObject *
4308PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4309 Py_ssize_t size,
4310 int base64SetO,
4311 int base64WhiteSpace,
4312 const char *errors)
4313{
4314 PyObject *result;
4315 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4316 if (tmp == NULL)
4317 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004318 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004319 base64WhiteSpace, errors);
4320 Py_DECREF(tmp);
4321 return result;
4322}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324#undef IS_BASE64
4325#undef FROM_BASE64
4326#undef TO_BASE64
4327#undef DECODE_DIRECT
4328#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004329
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330/* --- UTF-8 Codec -------------------------------------------------------- */
4331
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix:
   continuation bytes 80-BF, the overlong prefixes C0-C1, and F5-FF.
   See RFC 3629 for details.

   The table is never written to, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4353
Alexander Belopolsky40018472011-02-26 01:02:56 +00004354PyObject *
4355PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004356 Py_ssize_t size,
4357 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358{
Walter Dörwald69652032004-09-07 20:24:22 +00004359 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4360}
4361
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004362#include "stringlib/ucs1lib.h"
4363#include "stringlib/codecs.h"
4364#include "stringlib/undef.h"
4365
4366#include "stringlib/ucs2lib.h"
4367#include "stringlib/codecs.h"
4368#include "stringlib/undef.h"
4369
4370#include "stringlib/ucs4lib.h"
4371#include "stringlib/codecs.h"
4372#include "stringlib/undef.h"
4373
Antoine Pitrouab868312009-01-10 15:40:25 +00004374/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4375#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4376
4377/* Mask to quickly check whether a C 'long' contains a
4378 non-ASCII, UTF8-encoded char. */
4379#if (SIZEOF_LONG == 8)
4380# define ASCII_CHAR_MASK 0x8080808080808080L
4381#elif (SIZEOF_LONG == 4)
4382# define ASCII_CHAR_MASK 0x80808080L
4383#else
4384# error C 'long' size should be either 4 or 8!
4385#endif
4386
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004387/* Scans a UTF-8 string and returns the maximum character to be expected
4388 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004389
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004390 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 */
4393static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004394utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004396 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004397 const unsigned char *end = p + string_size;
4398 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004399
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004400 assert(unicode_size != NULL);
4401
4402 /* By having a cascade of independent loops which fallback onto each
4403 other, we minimize the amount of work done in the average loop
4404 iteration, and we also maximize the CPU's ability to predict
4405 branches correctly (because a given condition will have always the
4406 same boolean outcome except perhaps in the last iteration of the
4407 corresponding loop).
4408 In the general case this brings us rather close to decoding
4409 performance pre-PEP 393, despite the two-pass decoding.
4410
4411 Note that the pure ASCII loop is not duplicated once a non-ASCII
4412 character has been encountered. It is actually a pessimization (by
4413 a significant factor) to use this loop on text with many non-ASCII
4414 characters, and it is important to avoid bad performance on valid
4415 utf-8 data (invalid utf-8 being a different can of worms).
4416 */
4417
4418 /* ASCII */
4419 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420 /* Only check value if it's not a ASCII char... */
4421 if (*p < 0x80) {
4422 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4423 an explanation. */
4424 if (!((size_t) p & LONG_PTR_MASK)) {
4425 /* Help register allocation */
4426 register const unsigned char *_p = p;
4427 while (_p < aligned_end) {
4428 unsigned long value = *(unsigned long *) _p;
4429 if (value & ASCII_CHAR_MASK)
4430 break;
4431 _p += SIZEOF_LONG;
4432 char_count += SIZEOF_LONG;
4433 }
4434 p = _p;
4435 if (p == end)
4436 break;
4437 }
4438 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004439 if (*p < 0x80)
4440 ++char_count;
4441 else
4442 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004443 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004444 *unicode_size = char_count;
4445 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004447_ucs1loop:
4448 for (; p < end; ++p) {
4449 if (*p < 0xc4)
4450 char_count += ((*p & 0xc0) != 0x80);
4451 else
4452 goto _ucs2loop;
4453 }
4454 *unicode_size = char_count;
4455 return 255;
4456
4457_ucs2loop:
4458 for (; p < end; ++p) {
4459 if (*p < 0xf0)
4460 char_count += ((*p & 0xc0) != 0x80);
4461 else
4462 goto _ucs4loop;
4463 }
4464 *unicode_size = char_count;
4465 return 65535;
4466
4467_ucs4loop:
4468 for (; p < end; ++p) {
4469 char_count += ((*p & 0xc0) != 0x80);
4470 }
4471 *unicode_size = char_count;
4472 return 65537;
4473}
4474
/* Store `value` at position `index` of the string being built, in the
   manner of PyUnicode_WRITE, except that the target may first be grown
   (and, through unicode_putchar(), widened) when decoding errors made the
   initial size estimate wrong.

   Names taken from the expansion site: the variable `unicode` (the output
   object, which may be replaced) and the label `onError` (jumped to on
   failure).  Growth overallocates by one eighth of the position, so the
   caller has to shrink the result to its real length when done. */
#define WRITE_MAYBE_FAIL(index, value)                                      \
    do {                                                                    \
        Py_ssize_t write_pos = (index);                                     \
        if (write_pos > PyUnicode_GET_LENGTH(unicode)) {                    \
            if (unicode_resize(&unicode, write_pos + write_pos/8) < 0)      \
                goto onError;                                               \
        }                                                                   \
        if (unicode_putchar(&unicode, &write_pos, (value)) < 0)             \
            goto onError;                                                   \
    } while (0)
4488
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004489static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004490decode_utf8_errors(const char *starts,
4491 Py_ssize_t size,
4492 const char *errors,
4493 Py_ssize_t *consumed,
4494 const char *s,
4495 PyObject *unicode,
4496 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004497{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004499 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004500 Py_ssize_t startinpos;
4501 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004502 const char *e = starts + size;
4503 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004504 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004505 PyObject *errorHandler = NULL;
4506 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004507
Antoine Pitrouab868312009-01-10 15:40:25 +00004508 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509
4510 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004511 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
4513 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004514 /* Fast path for runs of ASCII characters. Given that common UTF-8
4515 input will consist of an overwhelming majority of ASCII
4516 characters, we try to optimize for this case by checking
4517 as many characters as a C 'long' can contain.
4518 First, check if we can do an aligned read, as most CPUs have
4519 a penalty for unaligned reads.
4520 */
4521 if (!((size_t) s & LONG_PTR_MASK)) {
4522 /* Help register allocation */
4523 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004524 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004525 while (_s < aligned_end) {
4526 /* Read a whole long at a time (either 4 or 8 bytes),
4527 and do a fast unrolled copy if it only contains ASCII
4528 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004529 unsigned long value = *(unsigned long *) _s;
4530 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004531 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004532 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4533 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4534 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4535 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004536#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004537 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4538 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4539 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4540 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004541#endif
4542 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004543 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004544 }
4545 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004546 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004547 if (s == e)
4548 break;
4549 ch = (unsigned char)*s;
4550 }
4551 }
4552
4553 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004554 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 s++;
4556 continue;
4557 }
4558
4559 n = utf8_code_length[ch];
4560
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004561 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 if (consumed)
4563 break;
4564 else {
4565 errmsg = "unexpected end of data";
4566 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004567 endinpos = startinpos+1;
4568 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4569 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 goto utf8Error;
4571 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573
4574 switch (n) {
4575
4576 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004577 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004578 startinpos = s-starts;
4579 endinpos = startinpos+1;
4580 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581
4582 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004583 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 startinpos = s-starts;
4585 endinpos = startinpos+1;
4586 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587
4588 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004589 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004590 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004592 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 goto utf8Error;
4594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004596 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 break;
4599
4600 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004601 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4602 will result in surrogates in range d800-dfff. Surrogates are
4603 not valid UTF-8 so they are rejected.
4604 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4605 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004606 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004607 (s[2] & 0xc0) != 0x80 ||
4608 ((unsigned char)s[0] == 0xE0 &&
4609 (unsigned char)s[1] < 0xA0) ||
4610 ((unsigned char)s[0] == 0xED &&
4611 (unsigned char)s[1] > 0x9F)) {
4612 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004614 endinpos = startinpos + 1;
4615
4616 /* if s[1] first two bits are 1 and 0, then the invalid
4617 continuation byte is s[2], so increment endinpos by 1,
4618 if not, s[1] is invalid and endinpos doesn't need to
4619 be incremented. */
4620 if ((s[1] & 0xC0) == 0x80)
4621 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 goto utf8Error;
4623 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004625 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004626 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004627 break;
4628
4629 case 4:
4630 if ((s[1] & 0xc0) != 0x80 ||
4631 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004632 (s[3] & 0xc0) != 0x80 ||
4633 ((unsigned char)s[0] == 0xF0 &&
4634 (unsigned char)s[1] < 0x90) ||
4635 ((unsigned char)s[0] == 0xF4 &&
4636 (unsigned char)s[1] > 0x8F)) {
4637 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004639 endinpos = startinpos + 1;
4640 if ((s[1] & 0xC0) == 0x80) {
4641 endinpos++;
4642 if ((s[2] & 0xC0) == 0x80)
4643 endinpos++;
4644 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 goto utf8Error;
4646 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004647 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004648 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004649 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004650
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004651 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 }
4654 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004655 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004656
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 if (unicode_decode_call_errorhandler(
4659 errors, &errorHandler,
4660 "utf8", errmsg,
4661 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004662 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004664 /* Update data because unicode_decode_call_errorhandler might have
4665 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 }
Walter Dörwald69652032004-09-07 20:24:22 +00004668 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004671 /* Adjust length and ready string when it contained errors and
4672 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004673 if (unicode_resize(&unicode, i) < 0)
4674 goto onError;
4675 unicode_adjust_maxchar(&unicode);
4676 if (unicode == NULL)
4677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 Py_XDECREF(errorHandler);
4680 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004681 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004682 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 Py_XDECREF(errorHandler);
4686 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004687 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688 return NULL;
4689}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004690#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004691
Victor Stinner785938e2011-12-11 20:09:03 +01004692PyObject *
4693PyUnicode_DecodeUTF8Stateful(const char *s,
4694 Py_ssize_t size,
4695 const char *errors,
4696 Py_ssize_t *consumed)
4697{
4698 Py_UCS4 maxchar = 0;
4699 Py_ssize_t unicode_size;
4700 int has_errors = 0;
4701 PyObject *unicode;
4702 int kind;
4703 void *data;
4704 const char *starts = s;
4705 const char *e;
4706 Py_ssize_t i;
4707
4708 if (size == 0) {
4709 if (consumed)
4710 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004711 Py_INCREF(unicode_empty);
4712 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004713 }
4714
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004715 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004716
4717 /* When the string is ASCII only, just use memcpy and return.
4718 unicode_size may be != size if there is an incomplete UTF-8
4719 sequence at the end of the ASCII block. */
4720 if (maxchar < 128 && size == unicode_size) {
4721 if (consumed)
4722 *consumed = size;
4723 return unicode_fromascii(s, size);
4724 }
4725
4726 unicode = PyUnicode_New(unicode_size, maxchar);
4727 if (!unicode)
4728 return NULL;
4729 kind = PyUnicode_KIND(unicode);
4730 data = PyUnicode_DATA(unicode);
4731
4732 /* Unpack UTF-8 encoded data */
4733 i = 0;
4734 e = starts + size;
4735 switch (kind) {
4736 case PyUnicode_1BYTE_KIND:
4737 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4738 break;
4739 case PyUnicode_2BYTE_KIND:
4740 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4741 break;
4742 case PyUnicode_4BYTE_KIND:
4743 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4744 break;
4745 }
4746 if (!has_errors) {
4747 /* Ensure the unicode size calculation was correct */
4748 assert(i == unicode_size);
4749 assert(s == e);
4750 if (consumed)
4751 *consumed = size;
4752 return unicode;
4753 }
4754
4755 /* In case of errors, maxchar and size computation might be incorrect;
4756 code below refits and resizes as necessary. */
4757 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4758}
4759
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004760#ifdef __APPLE__
4761
4762/* Simplified UTF-8 decoder using surrogateescape error handler,
4763 used to decode the command line arguments on Mac OS X. */
4764
4765wchar_t*
4766_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4767{
4768 int n;
4769 const char *e;
4770 wchar_t *unicode, *p;
4771
4772 /* Note: size will always be longer than the resulting Unicode
4773 character count */
4774 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4775 PyErr_NoMemory();
4776 return NULL;
4777 }
4778 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4779 if (!unicode)
4780 return NULL;
4781
4782 /* Unpack UTF-8 encoded data */
4783 p = unicode;
4784 e = s + size;
4785 while (s < e) {
4786 Py_UCS4 ch = (unsigned char)*s;
4787
4788 if (ch < 0x80) {
4789 *p++ = (wchar_t)ch;
4790 s++;
4791 continue;
4792 }
4793
4794 n = utf8_code_length[ch];
4795 if (s + n > e) {
4796 goto surrogateescape;
4797 }
4798
4799 switch (n) {
4800 case 0:
4801 case 1:
4802 goto surrogateescape;
4803
4804 case 2:
4805 if ((s[1] & 0xc0) != 0x80)
4806 goto surrogateescape;
4807 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4808 assert ((ch > 0x007F) && (ch <= 0x07FF));
4809 *p++ = (wchar_t)ch;
4810 break;
4811
4812 case 3:
4813 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4814 will result in surrogates in range d800-dfff. Surrogates are
4815 not valid UTF-8 so they are rejected.
4816 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4817 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4818 if ((s[1] & 0xc0) != 0x80 ||
4819 (s[2] & 0xc0) != 0x80 ||
4820 ((unsigned char)s[0] == 0xE0 &&
4821 (unsigned char)s[1] < 0xA0) ||
4822 ((unsigned char)s[0] == 0xED &&
4823 (unsigned char)s[1] > 0x9F)) {
4824
4825 goto surrogateescape;
4826 }
4827 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4828 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004830 break;
4831
4832 case 4:
4833 if ((s[1] & 0xc0) != 0x80 ||
4834 (s[2] & 0xc0) != 0x80 ||
4835 (s[3] & 0xc0) != 0x80 ||
4836 ((unsigned char)s[0] == 0xF0 &&
4837 (unsigned char)s[1] < 0x90) ||
4838 ((unsigned char)s[0] == 0xF4 &&
4839 (unsigned char)s[1] > 0x8F)) {
4840 goto surrogateescape;
4841 }
4842 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4843 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004844 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004845
4846#if SIZEOF_WCHAR_T == 4
4847 *p++ = (wchar_t)ch;
4848#else
4849 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004850 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4851 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004852#endif
4853 break;
4854 }
4855 s += n;
4856 continue;
4857
4858 surrogateescape:
4859 *p++ = 0xDC00 + ch;
4860 s++;
4861 }
4862 *p = L'\0';
4863 return unicode;
4864}
4865
4866#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868/* Primary internal function which creates utf8 encoded bytes objects.
4869
4870 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004871 and allocate exactly as much space needed at the end. Else allocate the
4872 maximum possible needed (4 result bytes per Unicode character), and return
4873 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004875PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Tim Peters602f7402002-04-27 18:03:26 +00004878#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004879
Guido van Rossum98297ee2007-11-06 21:34:58 +00004880 Py_ssize_t i; /* index into s of next input byte */
4881 PyObject *result; /* result string object */
4882 char *p; /* next free byte in output buffer */
4883 Py_ssize_t nallocated; /* number of result bytes allocated */
4884 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004885 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004886 PyObject *errorHandler = NULL;
4887 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888 int kind;
4889 void *data;
4890 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004891 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893 if (!PyUnicode_Check(unicode)) {
4894 PyErr_BadArgument();
4895 return NULL;
4896 }
4897
4898 if (PyUnicode_READY(unicode) == -1)
4899 return NULL;
4900
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004901 if (PyUnicode_UTF8(unicode))
4902 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4903 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904
4905 kind = PyUnicode_KIND(unicode);
4906 data = PyUnicode_DATA(unicode);
4907 size = PyUnicode_GET_LENGTH(unicode);
4908
Tim Peters602f7402002-04-27 18:03:26 +00004909 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910
Tim Peters602f7402002-04-27 18:03:26 +00004911 if (size <= MAX_SHORT_UNICHARS) {
4912 /* Write into the stack buffer; nallocated can't overflow.
4913 * At the end, we'll allocate exactly as much heap space as it
4914 * turns out we need.
4915 */
4916 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004917 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004918 p = stackbuf;
4919 }
4920 else {
4921 /* Overallocate on the heap, and give the excess back at the end. */
4922 nallocated = size * 4;
4923 if (nallocated / 4 != size) /* overflow! */
4924 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004925 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004926 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004927 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004928 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004929 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004930
Tim Peters602f7402002-04-27 18:03:26 +00004931 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004933
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004934 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004935 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004939 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004940 *p++ = (char)(0xc0 | (ch >> 6));
4941 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004942 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004943 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004944 Py_ssize_t repsize, k, startpos;
4945 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946 rep = unicode_encode_call_errorhandler(
4947 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004948 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949 if (!rep)
4950 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004952 if (PyBytes_Check(rep))
4953 repsize = PyBytes_GET_SIZE(rep);
4954 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004955 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004956
4957 if (repsize > 4) {
4958 Py_ssize_t offset;
4959
4960 if (result == NULL)
4961 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004962 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004963 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004965 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4966 /* integer overflow */
4967 PyErr_NoMemory();
4968 goto error;
4969 }
4970 nallocated += repsize - 4;
4971 if (result != NULL) {
4972 if (_PyBytes_Resize(&result, nallocated) < 0)
4973 goto error;
4974 } else {
4975 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004976 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004977 goto error;
4978 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4979 }
4980 p = PyBytes_AS_STRING(result) + offset;
4981 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004983 if (PyBytes_Check(rep)) {
4984 char *prep = PyBytes_AS_STRING(rep);
4985 for(k = repsize; k > 0; k--)
4986 *p++ = *prep++;
4987 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004988 enum PyUnicode_Kind repkind;
4989 void *repdata;
4990
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004991 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004992 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004993 repkind = PyUnicode_KIND(rep);
4994 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004995
4996 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004997 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004998 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004999 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01005000 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01005001 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005002 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00005003 goto error;
5004 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01005005 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00005006 }
Victor Stinner31be90b2010-04-22 19:38:16 +00005007 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01005008 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00005009 } else if (ch < 0x10000) {
5010 *p++ = (char)(0xe0 | (ch >> 12));
5011 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5012 *p++ = (char)(0x80 | (ch & 0x3f));
5013 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01005014 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00005015 /* Encode UCS4 Unicode ordinals */
5016 *p++ = (char)(0xf0 | (ch >> 18));
5017 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5018 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5019 *p++ = (char)(0x80 | (ch & 0x3f));
5020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 }
Tim Peters0eca65c2002-04-21 17:28:06 +00005022
Guido van Rossum98297ee2007-11-06 21:34:58 +00005023 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00005024 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005025 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00005026 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00005027 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00005028 }
5029 else {
Christian Heimesf3863112007-11-22 07:46:41 +00005030 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00005031 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00005032 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00005033 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00005034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005035
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005036 Py_XDECREF(errorHandler);
5037 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005038 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005039 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01005040 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005041 Py_XDECREF(errorHandler);
5042 Py_XDECREF(exc);
5043 Py_XDECREF(result);
5044 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005045
Tim Peters602f7402002-04-27 18:03:26 +00005046#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047}
5048
Alexander Belopolsky40018472011-02-26 01:02:56 +00005049PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005050PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5051 Py_ssize_t size,
5052 const char *errors)
5053{
5054 PyObject *v, *unicode;
5055
5056 unicode = PyUnicode_FromUnicode(s, size);
5057 if (unicode == NULL)
5058 return NULL;
5059 v = _PyUnicode_AsUTF8String(unicode, errors);
5060 Py_DECREF(unicode);
5061 return v;
5062}
5063
5064PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005065PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005067 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068}
5069
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070/* --- UTF-32 Codec ------------------------------------------------------- */
5071
5072PyObject *
5073PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 Py_ssize_t size,
5075 const char *errors,
5076 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077{
5078 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5079}
5080
5081PyObject *
5082PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 Py_ssize_t size,
5084 const char *errors,
5085 int *byteorder,
5086 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087{
5088 const char *starts = s;
5089 Py_ssize_t startinpos;
5090 Py_ssize_t endinpos;
5091 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005092 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005093 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 int bo = 0; /* assume native ordering by default */
5095 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 /* Offsets from q for retrieving bytes in the right order. */
5097#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5098 int iorder[] = {0, 1, 2, 3};
5099#else
5100 int iorder[] = {3, 2, 1, 0};
5101#endif
5102 PyObject *errorHandler = NULL;
5103 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005104
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 q = (unsigned char *)s;
5106 e = q + size;
5107
5108 if (byteorder)
5109 bo = *byteorder;
5110
5111 /* Check for BOM marks (U+FEFF) in the input and adjust current
5112 byte order setting accordingly. In native mode, the leading BOM
5113 mark is skipped, in all other modes, it is copied to the output
5114 stream as-is (giving a ZWNBSP character). */
5115 if (bo == 0) {
5116 if (size >= 4) {
5117 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 if (bom == 0x0000FEFF) {
5121 q += 4;
5122 bo = -1;
5123 }
5124 else if (bom == 0xFFFE0000) {
5125 q += 4;
5126 bo = 1;
5127 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 if (bom == 0x0000FEFF) {
5130 q += 4;
5131 bo = 1;
5132 }
5133 else if (bom == 0xFFFE0000) {
5134 q += 4;
5135 bo = -1;
5136 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139 }
5140
5141 if (bo == -1) {
5142 /* force LE */
5143 iorder[0] = 0;
5144 iorder[1] = 1;
5145 iorder[2] = 2;
5146 iorder[3] = 3;
5147 }
5148 else if (bo == 1) {
5149 /* force BE */
5150 iorder[0] = 3;
5151 iorder[1] = 2;
5152 iorder[2] = 1;
5153 iorder[3] = 0;
5154 }
5155
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005156 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005157 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005158 if (!unicode)
5159 return NULL;
5160 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005162 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005163
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_UCS4 ch;
5166 /* remaining bytes at the end? (size should be divisible by 4) */
5167 if (e-q<4) {
5168 if (consumed)
5169 break;
5170 errmsg = "truncated data";
5171 startinpos = ((const char *)q)-starts;
5172 endinpos = ((const char *)e)-starts;
5173 goto utf32Error;
5174 /* The remaining input chars are ignored if the callback
5175 chooses to skip the input */
5176 }
5177 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5178 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 if (ch >= 0x110000)
5181 {
5182 errmsg = "codepoint not in range(0x110000)";
5183 startinpos = ((const char *)q)-starts;
5184 endinpos = startinpos+4;
5185 goto utf32Error;
5186 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005187 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5188 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 q += 4;
5190 continue;
5191 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 if (unicode_decode_call_errorhandler(
5193 errors, &errorHandler,
5194 "utf32", errmsg,
5195 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005196 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198 }
5199
5200 if (byteorder)
5201 *byteorder = bo;
5202
5203 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005205
5206 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005207 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005208 goto onError;
5209
5210 Py_XDECREF(errorHandler);
5211 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005212 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005213
Benjamin Peterson29060642009-01-31 22:14:21 +00005214 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215 Py_DECREF(unicode);
5216 Py_XDECREF(errorHandler);
5217 Py_XDECREF(exc);
5218 return NULL;
5219}
5220
5221PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005222_PyUnicode_EncodeUTF32(PyObject *str,
5223 const char *errors,
5224 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005226 int kind;
5227 void *data;
5228 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005231 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232 /* Offsets from p for storing byte pairs in the right order. */
5233#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5234 int iorder[] = {0, 1, 2, 3};
5235#else
5236 int iorder[] = {3, 2, 1, 0};
5237#endif
5238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239#define STORECHAR(CH) \
5240 do { \
5241 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5242 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5243 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5244 p[iorder[0]] = (CH) & 0xff; \
5245 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005246 } while(0)
5247
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248 if (!PyUnicode_Check(str)) {
5249 PyErr_BadArgument();
5250 return NULL;
5251 }
5252 if (PyUnicode_READY(str) < 0)
5253 return NULL;
5254 kind = PyUnicode_KIND(str);
5255 data = PyUnicode_DATA(str);
5256 len = PyUnicode_GET_LENGTH(str);
5257
5258 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005259 bytesize = nsize * 4;
5260 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005262 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005263 if (v == NULL)
5264 return NULL;
5265
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005266 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005270 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271
5272 if (byteorder == -1) {
5273 /* force LE */
5274 iorder[0] = 0;
5275 iorder[1] = 1;
5276 iorder[2] = 2;
5277 iorder[3] = 3;
5278 }
5279 else if (byteorder == 1) {
5280 /* force BE */
5281 iorder[0] = 3;
5282 iorder[1] = 2;
5283 iorder[2] = 1;
5284 iorder[3] = 0;
5285 }
5286
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005287 for (i = 0; i < len; i++)
5288 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005289
5290 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005291 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292#undef STORECHAR
5293}
5294
Alexander Belopolsky40018472011-02-26 01:02:56 +00005295PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005296PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5297 Py_ssize_t size,
5298 const char *errors,
5299 int byteorder)
5300{
5301 PyObject *result;
5302 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5303 if (tmp == NULL)
5304 return NULL;
5305 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5306 Py_DECREF(tmp);
5307 return result;
5308}
5309
5310PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005311PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312{
Victor Stinnerb960b342011-11-20 19:12:52 +01005313 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314}
5315
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316/* --- UTF-16 Codec ------------------------------------------------------- */
5317
Tim Peters772747b2001-08-09 22:21:55 +00005318PyObject *
5319PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 Py_ssize_t size,
5321 const char *errors,
5322 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323{
Walter Dörwald69652032004-09-07 20:24:22 +00005324 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5325}
5326
Antoine Pitrouab868312009-01-10 15:40:25 +00005327/* Two masks for fast checking of whether a C 'long' may contain
5328 UTF16-encoded surrogate characters. This is an efficient heuristic,
5329 assuming that non-surrogate characters with a code point >= 0x8000 are
5330 rare in most input.
5331 FAST_CHAR_MASK is used when the input is in native byte ordering,
5332 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005333*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005334#if (SIZEOF_LONG == 8)
5335# define FAST_CHAR_MASK 0x8000800080008000L
5336# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5337#elif (SIZEOF_LONG == 4)
5338# define FAST_CHAR_MASK 0x80008000L
5339# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5340#else
5341# error C 'long' size should be either 4 or 8!
5342#endif
5343
Walter Dörwald69652032004-09-07 20:24:22 +00005344PyObject *
5345PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 Py_ssize_t size,
5347 const char *errors,
5348 int *byteorder,
5349 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005352 Py_ssize_t startinpos;
5353 Py_ssize_t endinpos;
5354 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005355 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005356 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005357 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005359 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005360 /* Offsets from q for retrieving byte pairs in the right order. */
5361#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5362 int ihi = 1, ilo = 0;
5363#else
5364 int ihi = 0, ilo = 1;
5365#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 PyObject *errorHandler = NULL;
5367 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368
5369 /* Note: size will always be longer than the resulting Unicode
5370 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005371 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 if (!unicode)
5373 return NULL;
5374 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005375 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005376 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Tim Peters772747b2001-08-09 22:21:55 +00005378 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005379 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380
5381 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005382 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005384 /* Check for BOM marks (U+FEFF) in the input and adjust current
5385 byte order setting accordingly. In native mode, the leading BOM
5386 mark is skipped, in all other modes, it is copied to the output
5387 stream as-is (giving a ZWNBSP character). */
5388 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005389 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005390 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005391#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 if (bom == 0xFEFF) {
5393 q += 2;
5394 bo = -1;
5395 }
5396 else if (bom == 0xFFFE) {
5397 q += 2;
5398 bo = 1;
5399 }
Tim Petersced69f82003-09-16 20:30:58 +00005400#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 if (bom == 0xFEFF) {
5402 q += 2;
5403 bo = 1;
5404 }
5405 else if (bom == 0xFFFE) {
5406 q += 2;
5407 bo = -1;
5408 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005409#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412
Tim Peters772747b2001-08-09 22:21:55 +00005413 if (bo == -1) {
5414 /* force LE */
5415 ihi = 1;
5416 ilo = 0;
5417 }
5418 else if (bo == 1) {
5419 /* force BE */
5420 ihi = 0;
5421 ilo = 1;
5422 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5424 native_ordering = ilo < ihi;
5425#else
5426 native_ordering = ilo > ihi;
5427#endif
Tim Peters772747b2001-08-09 22:21:55 +00005428
Antoine Pitrouab868312009-01-10 15:40:25 +00005429 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005430 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005431 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 /* First check for possible aligned read of a C 'long'. Unaligned
5433 reads are more expensive, better to defer to another iteration. */
5434 if (!((size_t) q & LONG_PTR_MASK)) {
5435 /* Fast path for runs of non-surrogate chars. */
5436 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 int kind = PyUnicode_KIND(unicode);
5438 void *data = PyUnicode_DATA(unicode);
5439 while (_q < aligned_end) {
5440 unsigned long block = * (unsigned long *) _q;
5441 unsigned short *pblock = (unsigned short*)&block;
5442 Py_UCS4 maxch;
5443 if (native_ordering) {
5444 /* Can use buffer directly */
5445 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005446 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005447 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005448 else {
5449 /* Need to byte-swap */
5450 unsigned char *_p = (unsigned char*)pblock;
5451 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005452 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453 _p[0] = _q[1];
5454 _p[1] = _q[0];
5455 _p[2] = _q[3];
5456 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005457#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005458 _p[4] = _q[5];
5459 _p[5] = _q[4];
5460 _p[6] = _q[7];
5461 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005462#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005463 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005464 maxch = Py_MAX(pblock[0], pblock[1]);
5465#if SIZEOF_LONG == 8
5466 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5467#endif
5468 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5469 if (unicode_widen(&unicode, maxch) < 0)
5470 goto onError;
5471 kind = PyUnicode_KIND(unicode);
5472 data = PyUnicode_DATA(unicode);
5473 }
5474 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5475 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5476#if SIZEOF_LONG == 8
5477 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5478 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5479#endif
5480 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005481 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005482 q = _q;
5483 if (q >= e)
5484 break;
5485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487
Benjamin Peterson14339b62009-01-31 16:36:08 +00005488 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005489
Victor Stinner551ac952011-11-29 22:58:13 +01005490 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005491 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5492 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 continue;
5494 }
5495
5496 /* UTF-16 code pair: */
5497 if (q > e) {
5498 errmsg = "unexpected end of data";
5499 startinpos = (((const char *)q) - 2) - starts;
5500 endinpos = ((const char *)e) + 1 - starts;
5501 goto utf16Error;
5502 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005503 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5504 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005506 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005507 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005508 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005509 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 continue;
5511 }
5512 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005513 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 startinpos = (((const char *)q)-4)-starts;
5515 endinpos = startinpos+2;
5516 goto utf16Error;
5517 }
5518
Benjamin Peterson14339b62009-01-31 16:36:08 +00005519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 errmsg = "illegal encoding";
5521 startinpos = (((const char *)q)-2)-starts;
5522 endinpos = startinpos+2;
5523 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005524
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005526 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005527 errors,
5528 &errorHandler,
5529 "utf16", errmsg,
5530 &starts,
5531 (const char **)&e,
5532 &startinpos,
5533 &endinpos,
5534 &exc,
5535 (const char **)&q,
5536 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005537 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005540 /* remaining byte at the end? (size should be even) */
5541 if (e == q) {
5542 if (!consumed) {
5543 errmsg = "truncated data";
5544 startinpos = ((const char *)q) - starts;
5545 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005546 if (unicode_decode_call_errorhandler(
5547 errors,
5548 &errorHandler,
5549 "utf16", errmsg,
5550 &starts,
5551 (const char **)&e,
5552 &startinpos,
5553 &endinpos,
5554 &exc,
5555 (const char **)&q,
5556 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005557 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005558 goto onError;
5559 /* The remaining input chars are ignored if the callback
5560 chooses to skip the input */
5561 }
5562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
5564 if (byteorder)
5565 *byteorder = bo;
5566
Walter Dörwald69652032004-09-07 20:24:22 +00005567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005571 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 goto onError;
5573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 Py_XDECREF(errorHandler);
5575 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005576 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 Py_XDECREF(errorHandler);
5581 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 return NULL;
5583}
5584
Antoine Pitrouab868312009-01-10 15:40:25 +00005585#undef FAST_CHAR_MASK
5586#undef SWAPPED_FAST_CHAR_MASK
5587
Tim Peters772747b2001-08-09 22:21:55 +00005588PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005589_PyUnicode_EncodeUTF16(PyObject *str,
5590 const char *errors,
5591 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005593 int kind;
5594 void *data;
5595 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005596 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005597 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005598 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005599 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005600 /* Offsets from p for storing byte pairs in the right order. */
5601#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5602 int ihi = 1, ilo = 0;
5603#else
5604 int ihi = 0, ilo = 1;
5605#endif
5606
Benjamin Peterson29060642009-01-31 22:14:21 +00005607#define STORECHAR(CH) \
5608 do { \
5609 p[ihi] = ((CH) >> 8) & 0xff; \
5610 p[ilo] = (CH) & 0xff; \
5611 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005612 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005614 if (!PyUnicode_Check(str)) {
5615 PyErr_BadArgument();
5616 return NULL;
5617 }
5618 if (PyUnicode_READY(str) < 0)
5619 return NULL;
5620 kind = PyUnicode_KIND(str);
5621 data = PyUnicode_DATA(str);
5622 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005623
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005624 pairs = 0;
5625 if (kind == PyUnicode_4BYTE_KIND)
5626 for (i = 0; i < len; i++)
5627 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5628 pairs++;
5629 /* 2 * (len + pairs + (byteorder == 0)) */
5630 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005633 bytesize = nsize * 2;
5634 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005636 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 if (v == NULL)
5638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005640 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005644 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005645
5646 if (byteorder == -1) {
5647 /* force LE */
5648 ihi = 1;
5649 ilo = 0;
5650 }
5651 else if (byteorder == 1) {
5652 /* force BE */
5653 ihi = 0;
5654 ilo = 1;
5655 }
5656
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 for (i = 0; i < len; i++) {
5658 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5659 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005661 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5662 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 }
Tim Peters772747b2001-08-09 22:21:55 +00005664 STORECHAR(ch);
5665 if (ch2)
5666 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005667 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005668
5669 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005670 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005671#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672}
5673
Alexander Belopolsky40018472011-02-26 01:02:56 +00005674PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5676 Py_ssize_t size,
5677 const char *errors,
5678 int byteorder)
5679{
5680 PyObject *result;
5681 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5682 if (tmp == NULL)
5683 return NULL;
5684 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5685 Py_DECREF(tmp);
5686 return result;
5687}
5688
5689PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005690PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005692 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
5695/* --- Unicode Escape Codec ----------------------------------------------- */
5696
/* Helper function for PyUnicode_DecodeUnicodeEscape: decide whether the
   escaped input decodes to a pure ASCII string and, if so, how long the
   decoded string is.

   Returns the number of characters needed for the decoded string, or -1
   when that cannot be guaranteed to be ASCII or the input is unusable:
     - size is negative,
     - a byte > 127 appears,
     - a backslash is the last byte of the input,
     - an escape that may yield a non-ASCII character is seen
       (octal digits, \x, \u, \U, \N).

   Backslash + newline contributes zero characters, the simple escapes
   (\\ \' \" \b \f \t \n \r \v \a) one character each, and a backslash
   followed by any other ASCII byte counts as two characters. */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming `p + size`: pointer arithmetic with
       a negative offset would step outside the buffer. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5751
Fredrik Lundh06d12682001-01-24 07:59:11 +00005752static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
5755PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005756 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005760 Py_ssize_t startinpos;
5761 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005762 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005763 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005765 char* message;
5766 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 PyObject *errorHandler = NULL;
5768 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005769 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005772 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773
5774 /* After length_of_escaped_ascii_string() there are two alternatives,
5775 either the string is pure ASCII with named escapes like \n, etc.
5776 and we determined it's exact size (common case)
5777 or it contains \x, \u, ... escape sequences. then we create a
5778 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005779 if (len >= 0) {
5780 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 if (!v)
5782 goto onError;
5783 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 }
5785 else {
5786 /* Escaped strings will always be longer than the resulting
5787 Unicode string, so we start with size here and then reduce the
5788 length after conversion to the true value.
5789 (but if the error callback returns a long replacement string
5790 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005791 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005792 if (!v)
5793 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005794 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 }
5796
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005798 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005799 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 while (s < end) {
5803 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005804 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005807 /* The only case in which i == ascii_length is a backslash
5808 followed by a newline. */
5809 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005810
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811 /* Non-escape characters are interpreted as Unicode ordinals */
5812 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005813 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 continue;
5816 }
5817
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 /* \ - Escapes */
5820 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005821 c = *s++;
5822 if (s > end)
5823 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825 /* The only case in which i == ascii_length is a backslash
5826 followed by a newline. */
5827 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005829 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832#define WRITECHAR(ch) \
5833 do { \
5834 if (unicode_putchar(&v, &i, ch) < 0) \
5835 goto onError; \
5836 }while(0)
5837
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 case '\\': WRITECHAR('\\'); break;
5840 case '\'': WRITECHAR('\''); break;
5841 case '\"': WRITECHAR('\"'); break;
5842 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005843 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 case 'f': WRITECHAR('\014'); break;
5845 case 't': WRITECHAR('\t'); break;
5846 case 'n': WRITECHAR('\n'); break;
5847 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005848 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005849 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005850 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 case '0': case '1': case '2': case '3':
5855 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005856 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005857 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005858 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005859 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005860 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 break;
5864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 /* hex escapes */
5866 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005868 digits = 2;
5869 message = "truncated \\xXX escape";
5870 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005874 digits = 4;
5875 message = "truncated \\uXXXX escape";
5876 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005879 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005880 digits = 8;
5881 message = "truncated \\UXXXXXXXX escape";
5882 hexescape:
5883 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 if (s+digits>end) {
5885 endinpos = size;
5886 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 errors, &errorHandler,
5888 "unicodeescape", "end of string in escape sequence",
5889 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005890 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891 goto onError;
5892 goto nextByte;
5893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005894 for (j = 0; j < digits; ++j) {
5895 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005896 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005897 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 errors, &errorHandler,
5900 "unicodeescape", message,
5901 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005902 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005903 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005906 }
5907 chr = (chr<<4) & ~0xF;
5908 if (c >= '0' && c <= '9')
5909 chr += c - '0';
5910 else if (c >= 'a' && c <= 'f')
5911 chr += 10 + c - 'a';
5912 else
5913 chr += 10 + c - 'A';
5914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005915 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005916 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 /* _decoding_error will have already written into the
5918 target buffer. */
5919 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005920 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005921 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005922 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005924 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 errors, &errorHandler,
5928 "unicodeescape", "illegal Unicode character",
5929 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005931 goto onError;
5932 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933 break;
5934
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005936 case 'N':
5937 message = "malformed \\N character escape";
5938 if (ucnhash_CAPI == NULL) {
5939 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005940 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5941 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005942 if (ucnhash_CAPI == NULL)
5943 goto ucnhashError;
5944 }
5945 if (*s == '{') {
5946 const char *start = s+1;
5947 /* look for the closing brace */
5948 while (*s != '}' && s < end)
5949 s++;
5950 if (s > start && s < end && *s == '}') {
5951 /* found a name. look it up in the unicode database */
5952 message = "unknown Unicode character name";
5953 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005955 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005956 goto store;
5957 }
5958 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 errors, &errorHandler,
5962 "unicodeescape", message,
5963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005965 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005966 break;
5967
5968 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005969 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 message = "\\ at end of string";
5971 s--;
5972 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 errors, &errorHandler,
5975 "unicodeescape", message,
5976 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005977 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005978 goto onError;
5979 }
5980 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005981 WRITECHAR('\\');
5982 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005983 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005984 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005989#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990
Victor Stinner16e6a802011-12-12 13:24:15 +01005991 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005992 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005993 Py_XDECREF(errorHandler);
5994 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005995 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005996
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005998 PyErr_SetString(
5999 PyExc_UnicodeError,
6000 "\\N escapes not supported (can't load unicodedata module)"
6001 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006002 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 Py_XDECREF(errorHandler);
6004 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006005 return NULL;
6006
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 Py_XDECREF(errorHandler);
6010 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 return NULL;
6012}
6013
/* Return a Unicode-Escape encoded bytes version of the Unicode object.

   No enclosing quotes are added to the result.

*/
6020
Alexander Belopolsky40018472011-02-26 01:02:56 +00006021PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006022PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006025 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 int kind;
6028 void *data;
6029 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030
Thomas Wouters89f507f2006-12-13 04:49:30 +00006031 /* Initial allocation is based on the longest-possible unichr
6032 escape.
6033
6034 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6035 unichr, so in this case it's the longest unichr escape. In
6036 narrow (UTF-16) builds this is five chars per source unichr
6037 since there are two unichrs in the surrogate pair, so in narrow
6038 (UTF-16) builds it's not the longest unichr escape.
6039
6040 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6041 so in the narrow (UTF-16) build case it's the longest unichr
6042 escape.
6043 */
6044
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 if (!PyUnicode_Check(unicode)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
6049 if (PyUnicode_READY(unicode) < 0)
6050 return NULL;
6051 len = PyUnicode_GET_LENGTH(unicode);
6052 kind = PyUnicode_KIND(unicode);
6053 data = PyUnicode_DATA(unicode);
6054 switch(kind) {
6055 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6056 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6057 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6058 }
6059
6060 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006061 return PyBytes_FromStringAndSize(NULL, 0);
6062
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006065
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006066 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006068 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 if (repr == NULL)
6071 return NULL;
6072
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006073 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006076 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006077
Walter Dörwald79e913e2007-05-12 11:08:06 +00006078 /* Escape backslashes */
6079 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 *p++ = '\\';
6081 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006082 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006083 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006084
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006085 /* Map 21-bit characters to '\U00xxxxxx' */
6086 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006087 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006088 *p++ = '\\';
6089 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006090 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6091 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6092 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6093 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6094 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6095 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6096 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6097 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006099 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006100
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006102 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 *p++ = '\\';
6104 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006105 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6106 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6107 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6108 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006110
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006111 /* Map special whitespace to '\t', \n', '\r' */
6112 else if (ch == '\t') {
6113 *p++ = '\\';
6114 *p++ = 't';
6115 }
6116 else if (ch == '\n') {
6117 *p++ = '\\';
6118 *p++ = 'n';
6119 }
6120 else if (ch == '\r') {
6121 *p++ = '\\';
6122 *p++ = 'r';
6123 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006124
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006125 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006126 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006128 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006129 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6130 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006131 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Copy everything else as-is */
6134 else
6135 *p++ = (char) ch;
6136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 assert(p - PyBytes_AS_STRING(repr) > 0);
6139 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6140 return NULL;
6141 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142}
6143
Alexander Belopolsky40018472011-02-26 01:02:56 +00006144PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6146 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 PyObject *result;
6149 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6150 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 result = PyUnicode_AsUnicodeEscapeString(tmp);
6153 Py_DECREF(tmp);
6154 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155}
6156
6157/* --- Raw Unicode Escape Codec ------------------------------------------- */
6158
Alexander Belopolsky40018472011-02-26 01:02:56 +00006159PyObject *
6160PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006161 Py_ssize_t size,
6162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006165 Py_ssize_t startinpos;
6166 Py_ssize_t endinpos;
6167 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006168 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 const char *end;
6170 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 PyObject *errorHandler = NULL;
6172 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006173
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 /* Escaped strings will always be longer than the resulting
6175 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 length after conversion to the true value. (But decoding error
6177 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006178 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006182 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006183 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 end = s + size;
6185 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 unsigned char c;
6187 Py_UCS4 x;
6188 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Non-escape characters are interpreted as Unicode ordinals */
6192 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006193 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6194 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 startinpos = s-starts;
6198
6199 /* \u-escapes are only interpreted iff the number of leading
6200 backslashes if odd */
6201 bs = s;
6202 for (;s < end;) {
6203 if (*s != '\\')
6204 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006205 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6206 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 }
6208 if (((s - bs) & 1) == 0 ||
6209 s >= end ||
6210 (*s != 'u' && *s != 'U')) {
6211 continue;
6212 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006213 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 count = *s=='u' ? 4 : 8;
6215 s++;
6216
6217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 for (x = 0, i = 0; i < count; ++i, ++s) {
6219 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006220 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 endinpos = s-starts;
6222 if (unicode_decode_call_errorhandler(
6223 errors, &errorHandler,
6224 "rawunicodeescape", "truncated \\uXXXX",
6225 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 goto onError;
6228 goto nextByte;
6229 }
6230 x = (x<<4) & ~0xF;
6231 if (c >= '0' && c <= '9')
6232 x += c - '0';
6233 else if (c >= 'a' && c <= 'f')
6234 x += 10 + c - 'a';
6235 else
6236 x += 10 + c - 'A';
6237 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006238 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006239 if (unicode_putchar(&v, &outpos, x) < 0)
6240 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006241 } else {
6242 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006243 if (unicode_decode_call_errorhandler(
6244 errors, &errorHandler,
6245 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006247 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 nextByte:
6251 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006253 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006255 Py_XDECREF(errorHandler);
6256 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006257 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 Py_XDECREF(errorHandler);
6262 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 return NULL;
6264}
6265
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006270 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 char *p;
6272 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006273 Py_ssize_t expandsize, pos;
6274 int kind;
6275 void *data;
6276 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006278 if (!PyUnicode_Check(unicode)) {
6279 PyErr_BadArgument();
6280 return NULL;
6281 }
6282 if (PyUnicode_READY(unicode) < 0)
6283 return NULL;
6284 kind = PyUnicode_KIND(unicode);
6285 data = PyUnicode_DATA(unicode);
6286 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006287 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6288 bytes, and 1 byte characters 4. */
6289 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006290
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006291 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006293
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006294 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 if (repr == NULL)
6296 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006297 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006298 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006300 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006301 for (pos = 0; pos < len; pos++) {
6302 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* Map 32-bit characters to '\Uxxxxxxxx' */
6304 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006305 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006306 *p++ = '\\';
6307 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006308 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6309 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6310 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6311 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6312 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6313 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6314 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6315 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 *p++ = '\\';
6320 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006321 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6322 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6323 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6324 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* Copy everything else as-is */
6327 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 *p++ = (char) ch;
6329 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006330
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006331 assert(p > q);
6332 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006333 return NULL;
6334 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006338PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6339 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006341 PyObject *result;
6342 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6343 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006344 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006345 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6346 Py_DECREF(tmp);
6347 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348}
6349
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006350/* --- Unicode Internal Codec ------------------------------------------- */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t size,
6355 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006356{
6357 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006358 Py_ssize_t startinpos;
6359 Py_ssize_t endinpos;
6360 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006361 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006362 const char *end;
6363 const char *reason;
6364 PyObject *errorHandler = NULL;
6365 PyObject *exc = NULL;
6366
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006367 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006368 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006369 1))
6370 return NULL;
6371
Thomas Wouters89f507f2006-12-13 04:49:30 +00006372 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006373 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006376 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006377 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006378 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006379 end = s + size;
6380
6381 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006382 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006383 Py_UCS4 ch;
6384 /* We copy the raw representation one byte at a time because the
6385 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006386 ((char *) &uch)[0] = s[0];
6387 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006388#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006389 ((char *) &uch)[2] = s[2];
6390 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006391#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006392 ch = uch;
6393
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006394 /* We have to sanity check the raw data, otherwise doom looms for
6395 some malformed UCS-4 data. */
6396 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006397#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006398 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006399#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006400 end-s < Py_UNICODE_SIZE
6401 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006403 startinpos = s - starts;
6404 if (end-s < Py_UNICODE_SIZE) {
6405 endinpos = end-starts;
6406 reason = "truncated input";
6407 }
6408 else {
6409 endinpos = s - starts + Py_UNICODE_SIZE;
6410 reason = "illegal code point (> 0x10FFFF)";
6411 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 if (unicode_decode_call_errorhandler(
6413 errors, &errorHandler,
6414 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006415 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006416 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006417 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006418 continue;
6419 }
6420
6421 s += Py_UNICODE_SIZE;
6422#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006423 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006424 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006425 Py_UNICODE uch2;
6426 ((char *) &uch2)[0] = s[0];
6427 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006428 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006429 {
Victor Stinner551ac952011-11-29 22:58:13 +01006430 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006431 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006432 }
6433 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006434#endif
6435
6436 if (unicode_putchar(&v, &outpos, ch) < 0)
6437 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006438 }
6439
Victor Stinner16e6a802011-12-12 13:24:15 +01006440 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006441 goto onError;
6442 Py_XDECREF(errorHandler);
6443 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006444 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006445
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006447 Py_XDECREF(v);
6448 Py_XDECREF(errorHandler);
6449 Py_XDECREF(exc);
6450 return NULL;
6451}
6452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453/* --- Latin-1 Codec ------------------------------------------------------ */
6454
Alexander Belopolsky40018472011-02-26 01:02:56 +00006455PyObject *
6456PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006457 Py_ssize_t size,
6458 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006461 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462}
6463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006465static void
6466make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006467 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006468 PyObject *unicode,
6469 Py_ssize_t startpos, Py_ssize_t endpos,
6470 const char *reason)
6471{
6472 if (*exceptionObject == NULL) {
6473 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006475 encoding, unicode, startpos, endpos, reason);
6476 }
6477 else {
6478 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6479 goto onError;
6480 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6481 goto onError;
6482 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6483 goto onError;
6484 return;
6485 onError:
6486 Py_DECREF(*exceptionObject);
6487 *exceptionObject = NULL;
6488 }
6489}
6490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492static void
6493raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006494 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006495 PyObject *unicode,
6496 Py_ssize_t startpos, Py_ssize_t endpos,
6497 const char *reason)
6498{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006499 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006500 encoding, unicode, startpos, endpos, reason);
6501 if (*exceptionObject != NULL)
6502 PyCodec_StrictErrors(*exceptionObject);
6503}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504
6505/* error handling callback helper:
6506 build arguments, call the callback and check the arguments,
6507 put the result into newpos and return the replacement string, which
6508 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006509static PyObject *
6510unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006511 PyObject **errorHandler,
6512 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006514 Py_ssize_t startpos, Py_ssize_t endpos,
6515 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006517 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 PyObject *restuple;
6520 PyObject *resunicode;
6521
6522 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 }
6527
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 if (PyUnicode_READY(unicode) < 0)
6529 return NULL;
6530 len = PyUnicode_GET_LENGTH(unicode);
6531
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006532 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536
6537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006542 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 Py_DECREF(restuple);
6544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006546 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 &resunicode, newpos)) {
6548 Py_DECREF(restuple);
6549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006551 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6552 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6553 Py_DECREF(restuple);
6554 return NULL;
6555 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 *newpos = len + *newpos;
6558 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6560 Py_DECREF(restuple);
6561 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006562 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 Py_INCREF(resunicode);
6564 Py_DECREF(restuple);
6565 return resunicode;
6566}
6567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006570 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006571 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 /* input state */
6574 Py_ssize_t pos=0, size;
6575 int kind;
6576 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 /* output object */
6578 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 /* pointer into the output */
6580 char *str;
6581 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006583 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6584 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585 PyObject *errorHandler = NULL;
6586 PyObject *exc = NULL;
6587 /* the following variable is used for caching string comparisons
6588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6589 int known_errorHandler = -1;
6590
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 if (PyUnicode_READY(unicode) < 0)
6592 return NULL;
6593 size = PyUnicode_GET_LENGTH(unicode);
6594 kind = PyUnicode_KIND(unicode);
6595 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596 /* allocate enough for a simple encoding without
6597 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006598 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006599 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006600 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006601 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006602 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006603 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604 ressize = size;
6605
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 while (pos < size) {
6607 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 /* can we encode this? */
6610 if (c<limit) {
6611 /* no overflow check, because we know that the space is enough */
6612 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 Py_ssize_t requiredsize;
6617 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 Py_ssize_t collstart = pos;
6621 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 ++collend;
6625 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6626 if (known_errorHandler==-1) {
6627 if ((errors==NULL) || (!strcmp(errors, "strict")))
6628 known_errorHandler = 1;
6629 else if (!strcmp(errors, "replace"))
6630 known_errorHandler = 2;
6631 else if (!strcmp(errors, "ignore"))
6632 known_errorHandler = 3;
6633 else if (!strcmp(errors, "xmlcharrefreplace"))
6634 known_errorHandler = 4;
6635 else
6636 known_errorHandler = 0;
6637 }
6638 switch (known_errorHandler) {
6639 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006640 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 goto onError;
6642 case 2: /* replace */
6643 while (collstart++<collend)
6644 *str++ = '?'; /* fall through */
6645 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 break;
6648 case 4: /* xmlcharrefreplace */
6649 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 /* determine replacement size */
6651 for (i = collstart, repsize = 0; i < collend; ++i) {
6652 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6653 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006665 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006666 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 if (requiredsize > ressize) {
6672 if (requiredsize<2*ressize)
6673 requiredsize = 2*ressize;
6674 if (_PyBytes_Resize(&res, requiredsize))
6675 goto onError;
6676 str = PyBytes_AS_STRING(res) + respos;
6677 ressize = requiredsize;
6678 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 /* generate replacement */
6680 for (i = collstart; i < collend; ++i) {
6681 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006683 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 break;
6685 default:
6686 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006687 encoding, reason, unicode, &exc,
6688 collstart, collend, &newpos);
6689 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6690 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006692 if (PyBytes_Check(repunicode)) {
6693 /* Directly copy bytes result to output. */
6694 repsize = PyBytes_Size(repunicode);
6695 if (repsize > 1) {
6696 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006697 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006698 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6699 Py_DECREF(repunicode);
6700 goto onError;
6701 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006702 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006703 ressize += repsize-1;
6704 }
6705 memcpy(str, PyBytes_AsString(repunicode), repsize);
6706 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006707 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006708 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006709 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 /* need more space? (at least enough for what we
6712 have+the replacement+the rest of the string, so
6713 we won't have to check space for encodable characters) */
6714 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 repsize = PyUnicode_GET_LENGTH(repunicode);
6716 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 if (requiredsize > ressize) {
6718 if (requiredsize<2*ressize)
6719 requiredsize = 2*ressize;
6720 if (_PyBytes_Resize(&res, requiredsize)) {
6721 Py_DECREF(repunicode);
6722 goto onError;
6723 }
6724 str = PyBytes_AS_STRING(res) + respos;
6725 ressize = requiredsize;
6726 }
6727 /* check if there is anything unencodable in the replacement
6728 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729 for (i = 0; repsize-->0; ++i, ++str) {
6730 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006732 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 Py_DECREF(repunicode);
6735 goto onError;
6736 }
6737 *str = (char)c;
6738 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006740 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 }
6743 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006744 /* Resize if we allocated to much */
6745 size = str - PyBytes_AS_STRING(res);
6746 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006747 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006748 if (_PyBytes_Resize(&res, size) < 0)
6749 goto onError;
6750 }
6751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 Py_XDECREF(errorHandler);
6753 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006754 return res;
6755
6756 onError:
6757 Py_XDECREF(res);
6758 Py_XDECREF(errorHandler);
6759 Py_XDECREF(exc);
6760 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761}
6762
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764PyObject *
6765PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006766 Py_ssize_t size,
6767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 PyObject *result;
6770 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6771 if (unicode == NULL)
6772 return NULL;
6773 result = unicode_encode_ucs1(unicode, errors, 256);
6774 Py_DECREF(unicode);
6775 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
Alexander Belopolsky40018472011-02-26 01:02:56 +00006778PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780{
6781 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 PyErr_BadArgument();
6783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785 if (PyUnicode_READY(unicode) == -1)
6786 return NULL;
6787 /* Fast path: if it is a one-byte string, construct
6788 bytes object directly. */
6789 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6790 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6791 PyUnicode_GET_LENGTH(unicode));
6792 /* Non-Latin-1 characters present. Defer to above function to
6793 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006795}
6796
6797PyObject*
6798PyUnicode_AsLatin1String(PyObject *unicode)
6799{
6800 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
6803/* --- 7-bit ASCII Codec -------------------------------------------------- */
6804
Alexander Belopolsky40018472011-02-26 01:02:56 +00006805PyObject *
6806PyUnicode_DecodeASCII(const char *s,
6807 Py_ssize_t size,
6808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006811 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006812 int kind;
6813 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006814 Py_ssize_t startinpos;
6815 Py_ssize_t endinpos;
6816 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006817 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006818 int has_error;
6819 const unsigned char *p = (const unsigned char *)s;
6820 const unsigned char *end = p + size;
6821 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 PyObject *errorHandler = NULL;
6823 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006824
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006825 if (size == 0) {
6826 Py_INCREF(unicode_empty);
6827 return unicode_empty;
6828 }
6829
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006831 if (size == 1 && (unsigned char)s[0] < 128)
6832 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006833
Victor Stinner702c7342011-10-05 13:50:52 +02006834 has_error = 0;
6835 while (p < end && !has_error) {
6836 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6837 an explanation. */
6838 if (!((size_t) p & LONG_PTR_MASK)) {
6839 /* Help register allocation */
6840 register const unsigned char *_p = p;
6841 while (_p < aligned_end) {
6842 unsigned long value = *(unsigned long *) _p;
6843 if (value & ASCII_CHAR_MASK) {
6844 has_error = 1;
6845 break;
6846 }
6847 _p += SIZEOF_LONG;
6848 }
6849 if (_p == end)
6850 break;
6851 if (has_error)
6852 break;
6853 p = _p;
6854 }
6855 if (*p & 0x80) {
6856 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006857 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006858 }
6859 else {
6860 ++p;
6861 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006862 }
Victor Stinner702c7342011-10-05 13:50:52 +02006863 if (!has_error)
6864 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006865
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006866 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006870 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006871 kind = PyUnicode_KIND(v);
6872 data = PyUnicode_DATA(v);
6873 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 e = s + size;
6875 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 register unsigned char c = (unsigned char)*s;
6877 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006878 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 ++s;
6880 }
6881 else {
6882 startinpos = s-starts;
6883 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 if (unicode_decode_call_errorhandler(
6885 errors, &errorHandler,
6886 "ascii", "ordinal not in range(128)",
6887 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006888 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006890 kind = PyUnicode_KIND(v);
6891 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006894 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006895 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 Py_XDECREF(errorHandler);
6897 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006898 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006899 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006900
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 Py_XDECREF(errorHandler);
6904 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 return NULL;
6906}
6907
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006909PyObject *
6910PyUnicode_EncodeASCII(const Py_UNICODE *p,
6911 Py_ssize_t size,
6912 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 PyObject *result;
6915 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6916 if (unicode == NULL)
6917 return NULL;
6918 result = unicode_encode_ucs1(unicode, errors, 128);
6919 Py_DECREF(unicode);
6920 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
Alexander Belopolsky40018472011-02-26 01:02:56 +00006923PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006924_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
6926 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 PyErr_BadArgument();
6928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006930 if (PyUnicode_READY(unicode) == -1)
6931 return NULL;
6932 /* Fast path: if it is an ASCII-only string, construct bytes object
6933 directly. Else defer to above function to raise the exception. */
6934 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6935 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6936 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006937 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006938}
6939
6940PyObject *
6941PyUnicode_AsASCIIString(PyObject *unicode)
6942{
6943 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944}
6945
Victor Stinner99b95382011-07-04 14:23:54 +02006946#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006947
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006948/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006949
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006950#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951#define NEED_RETRY
6952#endif
6953
Victor Stinner3a50e702011-10-18 21:21:00 +02006954#ifndef WC_ERR_INVALID_CHARS
6955# define WC_ERR_INVALID_CHARS 0x0080
6956#endif
6957
6958static char*
6959code_page_name(UINT code_page, PyObject **obj)
6960{
6961 *obj = NULL;
6962 if (code_page == CP_ACP)
6963 return "mbcs";
6964 if (code_page == CP_UTF7)
6965 return "CP_UTF7";
6966 if (code_page == CP_UTF8)
6967 return "CP_UTF8";
6968
6969 *obj = PyBytes_FromFormat("cp%u", code_page);
6970 if (*obj == NULL)
6971 return NULL;
6972 return PyBytes_AS_STRING(*obj);
6973}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
Alexander Belopolsky40018472011-02-26 01:02:56 +00006975static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006976is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977{
6978 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
Victor Stinner3a50e702011-10-18 21:21:00 +02006981 if (!IsDBCSLeadByteEx(code_page, *curr))
6982 return 0;
6983
6984 prev = CharPrevExA(code_page, s, curr, 0);
6985 if (prev == curr)
6986 return 1;
6987 /* FIXME: This code is limited to "true" double-byte encodings,
6988 as it assumes an incomplete character consists of a single
6989 byte. */
6990 if (curr - prev == 2)
6991 return 1;
6992 if (!IsDBCSLeadByteEx(code_page, *prev))
6993 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 return 0;
6995}
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997static DWORD
6998decode_code_page_flags(UINT code_page)
6999{
7000 if (code_page == CP_UTF7) {
7001 /* The CP_UTF7 decoder only supports flags=0 */
7002 return 0;
7003 }
7004 else
7005 return MB_ERR_INVALID_CHARS;
7006}
7007
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 * Decode a byte string from a Windows code page into unicode object in strict
7010 * mode.
7011 *
7012 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7013 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007015static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007016decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007017 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 const char *in,
7019 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020{
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007022 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
7025 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007026 assert(insize > 0);
7027 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7028 if (outsize <= 0)
7029 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
7031 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 if (*v == NULL)
7035 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 }
7038 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007041 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044 }
7045
7046 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7048 if (outsize <= 0)
7049 goto error;
7050 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007051
Victor Stinner3a50e702011-10-18 21:21:00 +02007052error:
7053 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7054 return -2;
7055 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007056 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057}
7058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059/*
7060 * Decode a byte string from a code page into unicode object with an error
7061 * handler.
7062 *
7063 * Returns consumed size if succeed, or raise a WindowsError or
7064 * UnicodeDecodeError exception and returns -1 on error.
7065 */
7066static int
7067decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007068 PyObject **v,
7069 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 const char *errors)
7071{
7072 const char *startin = in;
7073 const char *endin = in + size;
7074 const DWORD flags = decode_code_page_flags(code_page);
7075 /* Ideally, we should get reason from FormatMessage. This is the Windows
7076 2000 English version of the message. */
7077 const char *reason = "No mapping for the Unicode character exists "
7078 "in the target code page.";
7079 /* each step cannot decode more than 1 character, but a character can be
7080 represented as a surrogate pair */
7081 wchar_t buffer[2], *startout, *out;
7082 int insize, outsize;
7083 PyObject *errorHandler = NULL;
7084 PyObject *exc = NULL;
7085 PyObject *encoding_obj = NULL;
7086 char *encoding;
7087 DWORD err;
7088 int ret = -1;
7089
7090 assert(size > 0);
7091
7092 encoding = code_page_name(code_page, &encoding_obj);
7093 if (encoding == NULL)
7094 return -1;
7095
7096 if (errors == NULL || strcmp(errors, "strict") == 0) {
7097 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7098 UnicodeDecodeError. */
7099 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7100 if (exc != NULL) {
7101 PyCodec_StrictErrors(exc);
7102 Py_CLEAR(exc);
7103 }
7104 goto error;
7105 }
7106
7107 if (*v == NULL) {
7108 /* Create unicode object */
7109 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7110 PyErr_NoMemory();
7111 goto error;
7112 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007113 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 if (*v == NULL)
7115 goto error;
7116 startout = PyUnicode_AS_UNICODE(*v);
7117 }
7118 else {
7119 /* Extend unicode object */
7120 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7121 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7122 PyErr_NoMemory();
7123 goto error;
7124 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007125 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 goto error;
7127 startout = PyUnicode_AS_UNICODE(*v) + n;
7128 }
7129
7130 /* Decode the byte string character per character */
7131 out = startout;
7132 while (in < endin)
7133 {
7134 /* Decode a character */
7135 insize = 1;
7136 do
7137 {
7138 outsize = MultiByteToWideChar(code_page, flags,
7139 in, insize,
7140 buffer, Py_ARRAY_LENGTH(buffer));
7141 if (outsize > 0)
7142 break;
7143 err = GetLastError();
7144 if (err != ERROR_NO_UNICODE_TRANSLATION
7145 && err != ERROR_INSUFFICIENT_BUFFER)
7146 {
7147 PyErr_SetFromWindowsErr(0);
7148 goto error;
7149 }
7150 insize++;
7151 }
7152 /* 4=maximum length of a UTF-8 sequence */
7153 while (insize <= 4 && (in + insize) <= endin);
7154
7155 if (outsize <= 0) {
7156 Py_ssize_t startinpos, endinpos, outpos;
7157
7158 startinpos = in - startin;
7159 endinpos = startinpos + 1;
7160 outpos = out - PyUnicode_AS_UNICODE(*v);
7161 if (unicode_decode_call_errorhandler(
7162 errors, &errorHandler,
7163 encoding, reason,
7164 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007165 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 {
7167 goto error;
7168 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007169 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 }
7171 else {
7172 in += insize;
7173 memcpy(out, buffer, outsize * sizeof(wchar_t));
7174 out += outsize;
7175 }
7176 }
7177
7178 /* write a NUL character at the end */
7179 *out = 0;
7180
7181 /* Extend unicode object */
7182 outsize = out - startout;
7183 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007184 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187
7188error:
7189 Py_XDECREF(encoding_obj);
7190 Py_XDECREF(errorHandler);
7191 Py_XDECREF(exc);
7192 return ret;
7193}
7194
Victor Stinner3a50e702011-10-18 21:21:00 +02007195static PyObject *
7196decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 const char *s, Py_ssize_t size,
7198 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199{
Victor Stinner76a31a62011-11-04 00:05:13 +01007200 PyObject *v = NULL;
7201 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 if (code_page < 0) {
7204 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7205 return NULL;
7206 }
7207
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 do
7212 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 if (size > INT_MAX) {
7215 chunk_size = INT_MAX;
7216 final = 0;
7217 done = 0;
7218 }
7219 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007220#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007221 {
7222 chunk_size = (int)size;
7223 final = (consumed == NULL);
7224 done = 1;
7225 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226
Victor Stinner76a31a62011-11-04 00:05:13 +01007227 /* Skip trailing lead-byte unless 'final' is set */
7228 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7229 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230
Victor Stinner76a31a62011-11-04 00:05:13 +01007231 if (chunk_size == 0 && done) {
7232 if (v != NULL)
7233 break;
7234 Py_INCREF(unicode_empty);
7235 return unicode_empty;
7236 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237
Victor Stinner76a31a62011-11-04 00:05:13 +01007238
7239 converted = decode_code_page_strict(code_page, &v,
7240 s, chunk_size);
7241 if (converted == -2)
7242 converted = decode_code_page_errors(code_page, &v,
7243 s, chunk_size,
7244 errors);
7245 assert(converted != 0);
7246
7247 if (converted < 0) {
7248 Py_XDECREF(v);
7249 return NULL;
7250 }
7251
7252 if (consumed)
7253 *consumed += converted;
7254
7255 s += converted;
7256 size -= converted;
7257 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007258
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007259 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007260}
7261
Alexander Belopolsky40018472011-02-26 01:02:56 +00007262PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007263PyUnicode_DecodeCodePageStateful(int code_page,
7264 const char *s,
7265 Py_ssize_t size,
7266 const char *errors,
7267 Py_ssize_t *consumed)
7268{
7269 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7270}
7271
7272PyObject *
7273PyUnicode_DecodeMBCSStateful(const char *s,
7274 Py_ssize_t size,
7275 const char *errors,
7276 Py_ssize_t *consumed)
7277{
7278 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7279}
7280
7281PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007282PyUnicode_DecodeMBCS(const char *s,
7283 Py_ssize_t size,
7284 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007285{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7287}
7288
Victor Stinner3a50e702011-10-18 21:21:00 +02007289static DWORD
7290encode_code_page_flags(UINT code_page, const char *errors)
7291{
7292 if (code_page == CP_UTF8) {
7293 if (winver.dwMajorVersion >= 6)
7294 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7295 and later */
7296 return WC_ERR_INVALID_CHARS;
7297 else
7298 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7299 return 0;
7300 }
7301 else if (code_page == CP_UTF7) {
7302 /* CP_UTF7 only supports flags=0 */
7303 return 0;
7304 }
7305 else {
7306 if (errors != NULL && strcmp(errors, "replace") == 0)
7307 return 0;
7308 else
7309 return WC_NO_BEST_FIT_CHARS;
7310 }
7311}
7312
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 * Encode a Unicode string to a Windows code page into a byte string in strict
7315 * mode.
7316 *
7317 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7318 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007320static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007321encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007324{
Victor Stinner554f3f02010-06-16 23:33:54 +00007325 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 BOOL *pusedDefaultChar = &usedDefaultChar;
7327 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007328 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007329 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007330 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 const DWORD flags = encode_code_page_flags(code_page, NULL);
7332 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007333 /* Create a substring so that we can get the UTF-16 representation
7334 of just the slice under consideration. */
7335 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336
Martin v. Löwis3d325192011-11-04 18:23:06 +01007337 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007338
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007340 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007342 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007343
Victor Stinner2fc507f2011-11-04 20:06:39 +01007344 substring = PyUnicode_Substring(unicode, offset, offset+len);
7345 if (substring == NULL)
7346 return -1;
7347 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7348 if (p == NULL) {
7349 Py_DECREF(substring);
7350 return -1;
7351 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007352
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007353 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 outsize = WideCharToMultiByte(code_page, flags,
7355 p, size,
7356 NULL, 0,
7357 NULL, pusedDefaultChar);
7358 if (outsize <= 0)
7359 goto error;
7360 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007361 if (pusedDefaultChar && *pusedDefaultChar) {
7362 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007364 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007365
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007369 if (*outbytes == NULL) {
7370 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007372 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374 }
7375 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 const Py_ssize_t n = PyBytes_Size(*outbytes);
7378 if (outsize > PY_SSIZE_T_MAX - n) {
7379 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007380 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007383 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7384 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007386 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388 }
7389
7390 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 outsize = WideCharToMultiByte(code_page, flags,
7392 p, size,
7393 out, outsize,
7394 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007395 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 if (outsize <= 0)
7397 goto error;
7398 if (pusedDefaultChar && *pusedDefaultChar)
7399 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007400 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007401
Victor Stinner3a50e702011-10-18 21:21:00 +02007402error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007403 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7405 return -2;
7406 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007407 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007408}
7409
Victor Stinner3a50e702011-10-18 21:21:00 +02007410/*
7411 * Encode a Unicode string to a Windows code page into a byte string using a
7412 * error handler.
7413 *
7414 * Returns consumed characters if succeed, or raise a WindowsError and returns
7415 * -1 on other error.
7416 */
7417static int
7418encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007419 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007421{
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 Py_ssize_t pos = unicode_offset;
7424 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 /* Ideally, we should get reason from FormatMessage. This is the Windows
7426 2000 English version of the message. */
7427 const char *reason = "invalid character";
7428 /* 4=maximum length of a UTF-8 sequence */
7429 char buffer[4];
7430 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7431 Py_ssize_t outsize;
7432 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 PyObject *errorHandler = NULL;
7434 PyObject *exc = NULL;
7435 PyObject *encoding_obj = NULL;
7436 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007437 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 PyObject *rep;
7439 int ret = -1;
7440
7441 assert(insize > 0);
7442
7443 encoding = code_page_name(code_page, &encoding_obj);
7444 if (encoding == NULL)
7445 return -1;
7446
7447 if (errors == NULL || strcmp(errors, "strict") == 0) {
7448 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7449 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007450 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 if (exc != NULL) {
7452 PyCodec_StrictErrors(exc);
7453 Py_DECREF(exc);
7454 }
7455 Py_XDECREF(encoding_obj);
7456 return -1;
7457 }
7458
7459 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7460 pusedDefaultChar = &usedDefaultChar;
7461 else
7462 pusedDefaultChar = NULL;
7463
7464 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7465 PyErr_NoMemory();
7466 goto error;
7467 }
7468 outsize = insize * Py_ARRAY_LENGTH(buffer);
7469
7470 if (*outbytes == NULL) {
7471 /* Create string object */
7472 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7473 if (*outbytes == NULL)
7474 goto error;
7475 out = PyBytes_AS_STRING(*outbytes);
7476 }
7477 else {
7478 /* Extend string object */
7479 Py_ssize_t n = PyBytes_Size(*outbytes);
7480 if (n > PY_SSIZE_T_MAX - outsize) {
7481 PyErr_NoMemory();
7482 goto error;
7483 }
7484 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7485 goto error;
7486 out = PyBytes_AS_STRING(*outbytes) + n;
7487 }
7488
7489 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7493 wchar_t chars[2];
7494 int charsize;
7495 if (ch < 0x10000) {
7496 chars[0] = (wchar_t)ch;
7497 charsize = 1;
7498 }
7499 else {
7500 ch -= 0x10000;
7501 chars[0] = 0xd800 + (ch >> 10);
7502 chars[1] = 0xdc00 + (ch & 0x3ff);
7503 charsize = 2;
7504 }
7505
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007507 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 buffer, Py_ARRAY_LENGTH(buffer),
7509 NULL, pusedDefaultChar);
7510 if (outsize > 0) {
7511 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7512 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 memcpy(out, buffer, outsize);
7515 out += outsize;
7516 continue;
7517 }
7518 }
7519 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7520 PyErr_SetFromWindowsErr(0);
7521 goto error;
7522 }
7523
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 rep = unicode_encode_call_errorhandler(
7525 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007526 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007527 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 if (rep == NULL)
7529 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007530 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531
7532 if (PyBytes_Check(rep)) {
7533 outsize = PyBytes_GET_SIZE(rep);
7534 if (outsize != 1) {
7535 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7536 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7537 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7538 Py_DECREF(rep);
7539 goto error;
7540 }
7541 out = PyBytes_AS_STRING(*outbytes) + offset;
7542 }
7543 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7544 out += outsize;
7545 }
7546 else {
7547 Py_ssize_t i;
7548 enum PyUnicode_Kind kind;
7549 void *data;
7550
7551 if (PyUnicode_READY(rep) < 0) {
7552 Py_DECREF(rep);
7553 goto error;
7554 }
7555
7556 outsize = PyUnicode_GET_LENGTH(rep);
7557 if (outsize != 1) {
7558 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7559 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7560 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7561 Py_DECREF(rep);
7562 goto error;
7563 }
7564 out = PyBytes_AS_STRING(*outbytes) + offset;
7565 }
7566 kind = PyUnicode_KIND(rep);
7567 data = PyUnicode_DATA(rep);
7568 for (i=0; i < outsize; i++) {
7569 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7570 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007571 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007572 encoding, unicode,
7573 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 "unable to encode error handler result to ASCII");
7575 Py_DECREF(rep);
7576 goto error;
7577 }
7578 *out = (unsigned char)ch;
7579 out++;
7580 }
7581 }
7582 Py_DECREF(rep);
7583 }
7584 /* write a NUL byte */
7585 *out = 0;
7586 outsize = out - PyBytes_AS_STRING(*outbytes);
7587 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7588 if (_PyBytes_Resize(outbytes, outsize) < 0)
7589 goto error;
7590 ret = 0;
7591
7592error:
7593 Py_XDECREF(encoding_obj);
7594 Py_XDECREF(errorHandler);
7595 Py_XDECREF(exc);
7596 return ret;
7597}
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599static PyObject *
7600encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007601 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 const char *errors)
7603{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007604 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007606 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007607 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007608
Victor Stinner2fc507f2011-11-04 20:06:39 +01007609 if (PyUnicode_READY(unicode) < 0)
7610 return NULL;
7611 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007612
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 if (code_page < 0) {
7614 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7615 return NULL;
7616 }
7617
Martin v. Löwis3d325192011-11-04 18:23:06 +01007618 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007619 return PyBytes_FromStringAndSize(NULL, 0);
7620
Victor Stinner7581cef2011-11-03 22:32:33 +01007621 offset = 0;
7622 do
7623 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007626 chunks. */
7627 if (len > INT_MAX/2) {
7628 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007629 done = 0;
7630 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007631 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007632#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007633 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007634 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007635 done = 1;
7636 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637
Victor Stinner76a31a62011-11-04 00:05:13 +01007638 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007640 errors);
7641 if (ret == -2)
7642 ret = encode_code_page_errors(code_page, &outbytes,
7643 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007644 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007645 if (ret < 0) {
7646 Py_XDECREF(outbytes);
7647 return NULL;
7648 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007649
Victor Stinner7581cef2011-11-03 22:32:33 +01007650 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007652 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007653
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 return outbytes;
7655}
7656
7657PyObject *
7658PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7659 Py_ssize_t size,
7660 const char *errors)
7661{
Victor Stinner7581cef2011-11-03 22:32:33 +01007662 PyObject *unicode, *res;
7663 unicode = PyUnicode_FromUnicode(p, size);
7664 if (unicode == NULL)
7665 return NULL;
7666 res = encode_code_page(CP_ACP, unicode, errors);
7667 Py_DECREF(unicode);
7668 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007669}
7670
7671PyObject *
7672PyUnicode_EncodeCodePage(int code_page,
7673 PyObject *unicode,
7674 const char *errors)
7675{
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007677}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007678
Alexander Belopolsky40018472011-02-26 01:02:56 +00007679PyObject *
7680PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007681{
7682 if (!PyUnicode_Check(unicode)) {
7683 PyErr_BadArgument();
7684 return NULL;
7685 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007686 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007687}
7688
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007689#undef NEED_RETRY
7690
Victor Stinner99b95382011-07-04 14:23:54 +02007691#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693/* --- Character Mapping Codec -------------------------------------------- */
7694
Alexander Belopolsky40018472011-02-26 01:02:56 +00007695PyObject *
7696PyUnicode_DecodeCharmap(const char *s,
7697 Py_ssize_t size,
7698 PyObject *mapping,
7699 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007702 Py_ssize_t startinpos;
7703 Py_ssize_t endinpos;
7704 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007706 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007707 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 PyObject *errorHandler = NULL;
7709 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007710
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 /* Default to Latin-1 */
7712 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007715 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007719 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007720 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007721 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007722 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007723 Py_ssize_t maplen;
7724 enum PyUnicode_Kind kind;
7725 void *data;
7726 Py_UCS4 x;
7727
7728 if (PyUnicode_READY(mapping) < 0)
7729 return NULL;
7730
7731 maplen = PyUnicode_GET_LENGTH(mapping);
7732 data = PyUnicode_DATA(mapping);
7733 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 while (s < e) {
7735 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007738 x = PyUnicode_READ(kind, data, ch);
7739 else
7740 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007742 if (x == 0xfffe)
7743 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 startinpos = s-starts;
7746 endinpos = startinpos+1;
7747 if (unicode_decode_call_errorhandler(
7748 errors, &errorHandler,
7749 "charmap", "character maps to <undefined>",
7750 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007751 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 goto onError;
7753 }
7754 continue;
7755 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007756
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007757 if (unicode_putchar(&v, &outpos, x) < 0)
7758 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007761 }
7762 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 while (s < e) {
7764 unsigned char ch = *s;
7765 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007766
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7768 w = PyLong_FromLong((long)ch);
7769 if (w == NULL)
7770 goto onError;
7771 x = PyObject_GetItem(mapping, w);
7772 Py_DECREF(w);
7773 if (x == NULL) {
7774 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7775 /* No mapping found means: mapping is undefined. */
7776 PyErr_Clear();
7777 x = Py_None;
7778 Py_INCREF(x);
7779 } else
7780 goto onError;
7781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 /* Apply mapping */
7784 if (PyLong_Check(x)) {
7785 long value = PyLong_AS_LONG(x);
7786 if (value < 0 || value > 65535) {
7787 PyErr_SetString(PyExc_TypeError,
7788 "character mapping must be in range(65536)");
7789 Py_DECREF(x);
7790 goto onError;
7791 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007792 if (unicode_putchar(&v, &outpos, value) < 0)
7793 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 }
7795 else if (x == Py_None) {
7796 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 startinpos = s-starts;
7798 endinpos = startinpos+1;
7799 if (unicode_decode_call_errorhandler(
7800 errors, &errorHandler,
7801 "charmap", "character maps to <undefined>",
7802 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007803 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 Py_DECREF(x);
7805 goto onError;
7806 }
7807 Py_DECREF(x);
7808 continue;
7809 }
7810 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007811 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007812
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007813 if (PyUnicode_READY(x) < 0)
7814 goto onError;
7815 targetsize = PyUnicode_GET_LENGTH(x);
7816
7817 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007819 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007820 PyUnicode_READ_CHAR(x, 0)) < 0)
7821 goto onError;
7822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 else if (targetsize > 1) {
7824 /* 1-n mapping */
7825 if (targetsize > extrachars) {
7826 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 Py_ssize_t needed = (targetsize - extrachars) + \
7828 (targetsize << 2);
7829 extrachars += needed;
7830 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007831 if (unicode_resize(&v,
7832 PyUnicode_GET_LENGTH(v) + needed) < 0)
7833 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 Py_DECREF(x);
7835 goto onError;
7836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007838 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7839 goto onError;
7840 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7841 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 extrachars -= targetsize;
7843 }
7844 /* 1-0 mapping: skip the character */
7845 }
7846 else {
7847 /* wrong return value */
7848 PyErr_SetString(PyExc_TypeError,
7849 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 Py_DECREF(x);
7851 goto onError;
7852 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 Py_DECREF(x);
7854 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007857 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007858 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007859 Py_XDECREF(errorHandler);
7860 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007861 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007862
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007864 Py_XDECREF(errorHandler);
7865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 Py_XDECREF(v);
7867 return NULL;
7868}
7869
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870/* Charmap encoding: the lookup table */
7871
Alexander Belopolsky40018472011-02-26 01:02:56 +00007872struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 PyObject_HEAD
7874 unsigned char level1[32];
7875 int count2, count3;
7876 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007877};
7878
7879static PyObject*
7880encoding_map_size(PyObject *obj, PyObject* args)
7881{
7882 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007883 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885}
7886
7887static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 PyDoc_STR("Return the size (in bytes) of this object") },
7890 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891};
7892
7893static void
7894encoding_map_dealloc(PyObject* o)
7895{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897}
7898
7899static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007900 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 "EncodingMap", /*tp_name*/
7902 sizeof(struct encoding_map), /*tp_basicsize*/
7903 0, /*tp_itemsize*/
7904 /* methods */
7905 encoding_map_dealloc, /*tp_dealloc*/
7906 0, /*tp_print*/
7907 0, /*tp_getattr*/
7908 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007909 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 0, /*tp_repr*/
7911 0, /*tp_as_number*/
7912 0, /*tp_as_sequence*/
7913 0, /*tp_as_mapping*/
7914 0, /*tp_hash*/
7915 0, /*tp_call*/
7916 0, /*tp_str*/
7917 0, /*tp_getattro*/
7918 0, /*tp_setattro*/
7919 0, /*tp_as_buffer*/
7920 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7921 0, /*tp_doc*/
7922 0, /*tp_traverse*/
7923 0, /*tp_clear*/
7924 0, /*tp_richcompare*/
7925 0, /*tp_weaklistoffset*/
7926 0, /*tp_iter*/
7927 0, /*tp_iternext*/
7928 encoding_map_methods, /*tp_methods*/
7929 0, /*tp_members*/
7930 0, /*tp_getset*/
7931 0, /*tp_base*/
7932 0, /*tp_dict*/
7933 0, /*tp_descr_get*/
7934 0, /*tp_descr_set*/
7935 0, /*tp_dictoffset*/
7936 0, /*tp_init*/
7937 0, /*tp_alloc*/
7938 0, /*tp_new*/
7939 0, /*tp_free*/
7940 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941};
7942
7943PyObject*
7944PyUnicode_BuildEncodingMap(PyObject* string)
7945{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007946 PyObject *result;
7947 struct encoding_map *mresult;
7948 int i;
7949 int need_dict = 0;
7950 unsigned char level1[32];
7951 unsigned char level2[512];
7952 unsigned char *mlevel1, *mlevel2, *mlevel3;
7953 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007954 int kind;
7955 void *data;
7956 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959 PyErr_BadArgument();
7960 return NULL;
7961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007962 kind = PyUnicode_KIND(string);
7963 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007964 memset(level1, 0xFF, sizeof level1);
7965 memset(level2, 0xFF, sizeof level2);
7966
7967 /* If there isn't a one-to-one mapping of NULL to \0,
7968 or if there are non-BMP characters, we need to use
7969 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007970 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007971 need_dict = 1;
7972 for (i = 1; i < 256; i++) {
7973 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 ch = PyUnicode_READ(kind, data, i);
7975 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007976 need_dict = 1;
7977 break;
7978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980 /* unmapped character */
7981 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982 l1 = ch >> 11;
7983 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984 if (level1[l1] == 0xFF)
7985 level1[l1] = count2++;
7986 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988 }
7989
7990 if (count2 >= 0xFF || count3 >= 0xFF)
7991 need_dict = 1;
7992
7993 if (need_dict) {
7994 PyObject *result = PyDict_New();
7995 PyObject *key, *value;
7996 if (!result)
7997 return NULL;
7998 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007999 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008000 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001 if (!key || !value)
8002 goto failed1;
8003 if (PyDict_SetItem(result, key, value) == -1)
8004 goto failed1;
8005 Py_DECREF(key);
8006 Py_DECREF(value);
8007 }
8008 return result;
8009 failed1:
8010 Py_XDECREF(key);
8011 Py_XDECREF(value);
8012 Py_DECREF(result);
8013 return NULL;
8014 }
8015
8016 /* Create a three-level trie */
8017 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8018 16*count2 + 128*count3 - 1);
8019 if (!result)
8020 return PyErr_NoMemory();
8021 PyObject_Init(result, &EncodingMapType);
8022 mresult = (struct encoding_map*)result;
8023 mresult->count2 = count2;
8024 mresult->count3 = count3;
8025 mlevel1 = mresult->level1;
8026 mlevel2 = mresult->level23;
8027 mlevel3 = mresult->level23 + 16*count2;
8028 memcpy(mlevel1, level1, 32);
8029 memset(mlevel2, 0xFF, 16*count2);
8030 memset(mlevel3, 0, 128*count3);
8031 count3 = 0;
8032 for (i = 1; i < 256; i++) {
8033 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008034 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035 /* unmapped character */
8036 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037 o1 = PyUnicode_READ(kind, data, i)>>11;
8038 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 i2 = 16*mlevel1[o1] + o2;
8040 if (mlevel2[i2] == 0xFF)
8041 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008042 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 i3 = 128*mlevel2[i2] + o3;
8044 mlevel3[i3] = i;
8045 }
8046 return result;
8047}
8048
8049static int
Victor Stinner22168992011-11-20 17:09:18 +01008050encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051{
8052 struct encoding_map *map = (struct encoding_map*)mapping;
8053 int l1 = c>>11;
8054 int l2 = (c>>7) & 0xF;
8055 int l3 = c & 0x7F;
8056 int i;
8057
Victor Stinner22168992011-11-20 17:09:18 +01008058 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 if (c == 0)
8061 return 0;
8062 /* level 1*/
8063 i = map->level1[l1];
8064 if (i == 0xFF) {
8065 return -1;
8066 }
8067 /* level 2*/
8068 i = map->level23[16*i+l2];
8069 if (i == 0xFF) {
8070 return -1;
8071 }
8072 /* level 3 */
8073 i = map->level23[16*map->count2 + 128*i + l3];
8074 if (i == 0) {
8075 return -1;
8076 }
8077 return i;
8078}
8079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080/* Lookup the character ch in the mapping. If the character
8081 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008082 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008083static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008084charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085{
Christian Heimes217cfd12007-12-02 14:31:20 +00008086 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 PyObject *x;
8088
8089 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 x = PyObject_GetItem(mapping, w);
8092 Py_DECREF(w);
8093 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8095 /* No mapping found means: mapping is undefined. */
8096 PyErr_Clear();
8097 x = Py_None;
8098 Py_INCREF(x);
8099 return x;
8100 } else
8101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008103 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008105 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 long value = PyLong_AS_LONG(x);
8107 if (value < 0 || value > 255) {
8108 PyErr_SetString(PyExc_TypeError,
8109 "character mapping must be in range(256)");
8110 Py_DECREF(x);
8111 return NULL;
8112 }
8113 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008115 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 /* wrong return value */
8119 PyErr_Format(PyExc_TypeError,
8120 "character mapping must return integer, bytes or None, not %.400s",
8121 x->ob_type->tp_name);
8122 Py_DECREF(x);
8123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124 }
8125}
8126
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008127static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008128charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8131 /* exponentially overallocate to minimize reallocations */
8132 if (requiredsize < 2*outsize)
8133 requiredsize = 2*outsize;
8134 if (_PyBytes_Resize(outobj, requiredsize))
8135 return -1;
8136 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137}
8138
/* Outcome of trying to encode one character with charmapencode_output. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised (lookup or resize) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008143 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 space is available. Return a new reference to the object that
8145 was put in the output buffer, or Py_None, if the mapping was undefined
8146 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008147 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008148static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008149charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008150 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152 PyObject *rep;
8153 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008154 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008155
Christian Heimes90aa7642007-12-19 02:45:37 +00008156 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 if (res == -1)
8160 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 if (outsize<requiredsize)
8162 if (charmapencode_resize(outobj, outpos, requiredsize))
8163 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008164 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 outstart[(*outpos)++] = (char)res;
8166 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 }
8168
8169 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 Py_DECREF(rep);
8174 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008175 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 if (PyLong_Check(rep)) {
8177 Py_ssize_t requiredsize = *outpos+1;
8178 if (outsize<requiredsize)
8179 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8180 Py_DECREF(rep);
8181 return enc_EXCEPTION;
8182 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008183 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 else {
8187 const char *repchars = PyBytes_AS_STRING(rep);
8188 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8189 Py_ssize_t requiredsize = *outpos+repsize;
8190 if (outsize<requiredsize)
8191 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8192 Py_DECREF(rep);
8193 return enc_EXCEPTION;
8194 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008195 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 memcpy(outstart + *outpos, repchars, repsize);
8197 *outpos += repsize;
8198 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 Py_DECREF(rep);
8201 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202}
8203
8204/* handle an error in PyUnicode_EncodeCharmap
8205 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008206static int
8207charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008208 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008210 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008211 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212{
8213 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008214 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008216 enum PyUnicode_Kind kind;
8217 void *data;
8218 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008220 Py_ssize_t collstartpos = *inpos;
8221 Py_ssize_t collendpos = *inpos+1;
8222 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 char *encoding = "charmap";
8224 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008226 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008227 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008229 if (PyUnicode_READY(unicode) < 0)
8230 return -1;
8231 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232 /* find all unencodable characters */
8233 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008234 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008235 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008236 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008237 val = encoding_map_lookup(ch, mapping);
8238 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 break;
8240 ++collendpos;
8241 continue;
8242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008243
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8245 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 if (rep==NULL)
8247 return -1;
8248 else if (rep!=Py_None) {
8249 Py_DECREF(rep);
8250 break;
8251 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008252 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 }
8255 /* cache callback name lookup
8256 * (if not done yet, i.e. it's the first error) */
8257 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 if ((errors==NULL) || (!strcmp(errors, "strict")))
8259 *known_errorHandler = 1;
8260 else if (!strcmp(errors, "replace"))
8261 *known_errorHandler = 2;
8262 else if (!strcmp(errors, "ignore"))
8263 *known_errorHandler = 3;
8264 else if (!strcmp(errors, "xmlcharrefreplace"))
8265 *known_errorHandler = 4;
8266 else
8267 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 }
8269 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008271 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 return -1;
8273 case 2: /* replace */
8274 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 x = charmapencode_output('?', mapping, res, respos);
8276 if (x==enc_EXCEPTION) {
8277 return -1;
8278 }
8279 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008280 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 return -1;
8282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 }
8284 /* fall through */
8285 case 3: /* ignore */
8286 *inpos = collendpos;
8287 break;
8288 case 4: /* xmlcharrefreplace */
8289 /* generate replacement (temporarily (mis)uses p) */
8290 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 char buffer[2+29+1+1];
8292 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008293 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 for (cp = buffer; *cp; ++cp) {
8295 x = charmapencode_output(*cp, mapping, res, respos);
8296 if (x==enc_EXCEPTION)
8297 return -1;
8298 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008299 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return -1;
8301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008302 }
8303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 *inpos = collendpos;
8305 break;
8306 default:
8307 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008312 if (PyBytes_Check(repunicode)) {
8313 /* Directly copy bytes result to output. */
8314 Py_ssize_t outsize = PyBytes_Size(*res);
8315 Py_ssize_t requiredsize;
8316 repsize = PyBytes_Size(repunicode);
8317 requiredsize = *respos + repsize;
8318 if (requiredsize > outsize)
8319 /* Make room for all additional bytes. */
8320 if (charmapencode_resize(res, respos, requiredsize)) {
8321 Py_DECREF(repunicode);
8322 return -1;
8323 }
8324 memcpy(PyBytes_AsString(*res) + *respos,
8325 PyBytes_AsString(repunicode), repsize);
8326 *respos += repsize;
8327 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008328 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008329 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008331 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008332 if (PyUnicode_READY(repunicode) < 0) {
8333 Py_DECREF(repunicode);
8334 return -1;
8335 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008336 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008337 data = PyUnicode_DATA(repunicode);
8338 kind = PyUnicode_KIND(repunicode);
8339 for (index = 0; index < repsize; index++) {
8340 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8341 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008343 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 return -1;
8345 }
8346 else if (x==enc_FAILED) {
8347 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008348 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
8350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008351 }
8352 *inpos = newpos;
8353 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 }
8355 return 0;
8356}
8357
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008359_PyUnicode_EncodeCharmap(PyObject *unicode,
8360 PyObject *mapping,
8361 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 /* output object */
8364 PyObject *res = NULL;
8365 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008366 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008367 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008369 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 PyObject *errorHandler = NULL;
8371 PyObject *exc = NULL;
8372 /* the following variable is used for caching string comparisons
8373 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8374 * 3=ignore, 4=xmlcharrefreplace */
8375 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008377 if (PyUnicode_READY(unicode) < 0)
8378 return NULL;
8379 size = PyUnicode_GET_LENGTH(unicode);
8380
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 /* Default to Latin-1 */
8382 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008383 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 /* allocate enough for a simple encoding without
8386 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008387 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 if (res == NULL)
8389 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008390 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008396 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 if (x==enc_EXCEPTION) /* error */
8398 goto onError;
8399 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008400 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 &exc,
8402 &known_errorHandler, &errorHandler, errors,
8403 &res, &respos)) {
8404 goto onError;
8405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 else
8408 /* done with this character => adjust input position */
8409 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008413 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008414 if (_PyBytes_Resize(&res, respos) < 0)
8415 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 Py_XDECREF(exc);
8418 Py_XDECREF(errorHandler);
8419 return res;
8420
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 Py_XDECREF(res);
8423 Py_XDECREF(exc);
8424 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 return NULL;
8426}
8427
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008428/* Deprecated */
8429PyObject *
8430PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8431 Py_ssize_t size,
8432 PyObject *mapping,
8433 const char *errors)
8434{
8435 PyObject *result;
8436 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8437 if (unicode == NULL)
8438 return NULL;
8439 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8440 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008441 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008442}
8443
Alexander Belopolsky40018472011-02-26 01:02:56 +00008444PyObject *
8445PyUnicode_AsCharmapString(PyObject *unicode,
8446 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447{
8448 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 PyErr_BadArgument();
8450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008452 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453}
8454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008456static void
8457make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008459 Py_ssize_t startpos, Py_ssize_t endpos,
8460 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 *exceptionObject = _PyUnicodeTranslateError_Create(
8464 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465 }
8466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8468 goto onError;
8469 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8470 goto onError;
8471 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8472 goto onError;
8473 return;
8474 onError:
8475 Py_DECREF(*exceptionObject);
8476 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 }
8478}
8479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008481static void
8482raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008484 Py_ssize_t startpos, Py_ssize_t endpos,
8485 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486{
8487 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491}
8492
8493/* error handling callback helper:
8494 build arguments, call the callback and check the arguments,
8495 put the result into newpos and return the replacement string, which
8496 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008497static PyObject *
8498unicode_translate_call_errorhandler(const char *errors,
8499 PyObject **errorHandler,
8500 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502 Py_ssize_t startpos, Py_ssize_t endpos,
8503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008505 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008507 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 PyObject *restuple;
8509 PyObject *resunicode;
8510
8511 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 }
8516
8517 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521
8522 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008527 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 Py_DECREF(restuple);
8529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
8531 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 &resunicode, &i_newpos)) {
8533 Py_DECREF(restuple);
8534 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008538 else
8539 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8542 Py_DECREF(restuple);
8543 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 Py_INCREF(resunicode);
8546 Py_DECREF(restuple);
8547 return resunicode;
8548}
8549
8550/* Lookup the character ch in the mapping and put the result in result,
8551 which must be decrefed by the caller.
8552 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008553static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555{
Christian Heimes217cfd12007-12-02 14:31:20 +00008556 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 PyObject *x;
8558
8559 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 x = PyObject_GetItem(mapping, w);
8562 Py_DECREF(w);
8563 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8565 /* No mapping found means: use 1:1 mapping. */
8566 PyErr_Clear();
8567 *result = NULL;
8568 return 0;
8569 } else
8570 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 }
8572 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 *result = x;
8574 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008576 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 long value = PyLong_AS_LONG(x);
8578 long max = PyUnicode_GetMax();
8579 if (value < 0 || value > max) {
8580 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008581 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 Py_DECREF(x);
8583 return -1;
8584 }
8585 *result = x;
8586 return 0;
8587 }
8588 else if (PyUnicode_Check(x)) {
8589 *result = x;
8590 return 0;
8591 }
8592 else {
8593 /* wrong return value */
8594 PyErr_SetString(PyExc_TypeError,
8595 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008596 Py_DECREF(x);
8597 return -1;
8598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599}
8600/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if not reallocate and adjust various state variables.
8602 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008603static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008608 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 /* exponentially overallocate to minimize reallocations */
8610 if (requiredsize < 2 * oldsize)
8611 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8613 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 }
8617 return 0;
8618}
8619/* lookup the character, put the result in the output string and adjust
8620 various state variables. Return a new reference to the object that
8621 was put in the output buffer in *result, or Py_None, if the mapping was
8622 undefined (in which case no character was written).
8623 The called must decref result.
8624 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008625static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8627 PyObject *mapping, Py_UCS4 **output,
8628 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8632 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 }
8638 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008640 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 }
8644 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 Py_ssize_t repsize;
8646 if (PyUnicode_READY(*res) == -1)
8647 return -1;
8648 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 if (repsize==1) {
8650 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 }
8653 else if (repsize!=0) {
8654 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 Py_ssize_t requiredsize = *opos +
8656 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 Py_ssize_t i;
8659 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 for(i = 0; i < repsize; i++)
8662 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 }
8665 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 return 0;
8668}
8669
Alexander Belopolsky40018472011-02-26 01:02:56 +00008670PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671_PyUnicode_TranslateCharmap(PyObject *input,
8672 PyObject *mapping,
8673 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 /* input object */
8676 char *idata;
8677 Py_ssize_t size, i;
8678 int kind;
8679 /* output buffer */
8680 Py_UCS4 *output = NULL;
8681 Py_ssize_t osize;
8682 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 char *reason = "character maps to <undefined>";
8686 PyObject *errorHandler = NULL;
8687 PyObject *exc = NULL;
8688 /* the following variable is used for caching string comparisons
8689 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8690 * 3=ignore, 4=xmlcharrefreplace */
8691 int known_errorHandler = -1;
8692
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 PyErr_BadArgument();
8695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 if (PyUnicode_READY(input) == -1)
8699 return NULL;
8700 idata = (char*)PyUnicode_DATA(input);
8701 kind = PyUnicode_KIND(input);
8702 size = PyUnicode_GET_LENGTH(input);
8703 i = 0;
8704
8705 if (size == 0) {
8706 Py_INCREF(input);
8707 return input;
8708 }
8709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 /* allocate enough for a simple 1:1 translation without
8711 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 osize = size;
8713 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8714 opos = 0;
8715 if (output == NULL) {
8716 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 /* try to encode it */
8722 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 if (charmaptranslate_output(input, i, mapping,
8724 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 Py_XDECREF(x);
8726 goto onError;
8727 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008728 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 else { /* untranslatable character */
8732 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8733 Py_ssize_t repsize;
8734 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 Py_ssize_t collstart = i;
8738 Py_ssize_t collend = i+1;
8739 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 while (collend < size) {
8743 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 goto onError;
8745 Py_XDECREF(x);
8746 if (x!=Py_None)
8747 break;
8748 ++collend;
8749 }
8750 /* cache callback name lookup
8751 * (if not done yet, i.e. it's the first error) */
8752 if (known_errorHandler==-1) {
8753 if ((errors==NULL) || (!strcmp(errors, "strict")))
8754 known_errorHandler = 1;
8755 else if (!strcmp(errors, "replace"))
8756 known_errorHandler = 2;
8757 else if (!strcmp(errors, "ignore"))
8758 known_errorHandler = 3;
8759 else if (!strcmp(errors, "xmlcharrefreplace"))
8760 known_errorHandler = 4;
8761 else
8762 known_errorHandler = 0;
8763 }
8764 switch (known_errorHandler) {
8765 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 raise_translate_exception(&exc, input, collstart,
8767 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008768 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 case 2: /* replace */
8770 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 for (coll = collstart; coll<collend; coll++)
8772 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 /* fall through */
8774 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 break;
8777 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 /* generate replacement (temporarily (mis)uses i) */
8779 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 char buffer[2+29+1+1];
8781 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8783 if (charmaptranslate_makespace(&output, &osize,
8784 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 goto onError;
8786 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 break;
8791 default:
8792 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 reason, input, &exc,
8794 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008795 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008797 if (PyUnicode_READY(repunicode) < 0) {
8798 Py_DECREF(repunicode);
8799 goto onError;
8800 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 repsize = PyUnicode_GET_LENGTH(repunicode);
8803 if (charmaptranslate_makespace(&output, &osize,
8804 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 Py_DECREF(repunicode);
8806 goto onError;
8807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 for (uni2 = 0; repsize-->0; ++uni2)
8809 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8810 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008812 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813 }
8814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8816 if (!res)
8817 goto onError;
8818 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008819 Py_XDECREF(exc);
8820 Py_XDECREF(errorHandler);
8821 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 Py_XDECREF(exc);
8826 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008827 return NULL;
8828}
8829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830/* Deprecated. Use PyUnicode_Translate instead. */
8831PyObject *
8832PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8833 Py_ssize_t size,
8834 PyObject *mapping,
8835 const char *errors)
8836{
8837 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8838 if (!unicode)
8839 return NULL;
8840 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8841}
8842
Alexander Belopolsky40018472011-02-26 01:02:56 +00008843PyObject *
8844PyUnicode_Translate(PyObject *str,
8845 PyObject *mapping,
8846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847{
8848 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008849
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 str = PyUnicode_FromObject(str);
8851 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 Py_DECREF(str);
8855 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008856
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 Py_XDECREF(str);
8859 return NULL;
8860}
Tim Petersced69f82003-09-16 20:30:58 +00008861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008863fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864{
8865 /* No need to call PyUnicode_READY(self) because this function is only
8866 called as a callback from fixup() which does it already. */
8867 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8868 const int kind = PyUnicode_KIND(self);
8869 void *data = PyUnicode_DATA(self);
8870 Py_UCS4 maxchar = 0, ch, fixed;
8871 Py_ssize_t i;
8872
8873 for (i = 0; i < len; ++i) {
8874 ch = PyUnicode_READ(kind, data, i);
8875 fixed = 0;
8876 if (ch > 127) {
8877 if (Py_UNICODE_ISSPACE(ch))
8878 fixed = ' ';
8879 else {
8880 const int decimal = Py_UNICODE_TODECIMAL(ch);
8881 if (decimal >= 0)
8882 fixed = '0' + decimal;
8883 }
8884 if (fixed != 0) {
8885 if (fixed > maxchar)
8886 maxchar = fixed;
8887 PyUnicode_WRITE(kind, data, i, fixed);
8888 }
8889 else if (ch > maxchar)
8890 maxchar = ch;
8891 }
8892 else if (ch > maxchar)
8893 maxchar = ch;
8894 }
8895
8896 return maxchar;
8897}
8898
8899PyObject *
8900_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8901{
8902 if (!PyUnicode_Check(unicode)) {
8903 PyErr_BadInternalCall();
8904 return NULL;
8905 }
8906 if (PyUnicode_READY(unicode) == -1)
8907 return NULL;
8908 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8909 /* If the string is already ASCII, just return the same string */
8910 Py_INCREF(unicode);
8911 return unicode;
8912 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008913 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914}
8915
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008916PyObject *
8917PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8918 Py_ssize_t length)
8919{
Victor Stinnerf0124502011-11-21 23:12:56 +01008920 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008921 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008922 Py_UCS4 maxchar;
8923 enum PyUnicode_Kind kind;
8924 void *data;
8925
8926 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008927 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008928 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008929 if (ch > 127) {
8930 int decimal = Py_UNICODE_TODECIMAL(ch);
8931 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008932 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008933 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008934 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008935 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008936
8937 /* Copy to a new string */
8938 decimal = PyUnicode_New(length, maxchar);
8939 if (decimal == NULL)
8940 return decimal;
8941 kind = PyUnicode_KIND(decimal);
8942 data = PyUnicode_DATA(decimal);
8943 /* Iterate over code points */
8944 for (i = 0; i < length; i++) {
8945 Py_UNICODE ch = s[i];
8946 if (ch > 127) {
8947 int decimal = Py_UNICODE_TODECIMAL(ch);
8948 if (decimal >= 0)
8949 ch = '0' + decimal;
8950 }
8951 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008953 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008954}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008955/* --- Decimal Encoder ---------------------------------------------------- */
8956
Alexander Belopolsky40018472011-02-26 01:02:56 +00008957int
8958PyUnicode_EncodeDecimal(Py_UNICODE *s,
8959 Py_ssize_t length,
8960 char *output,
8961 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008962{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008963 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008964 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008965 enum PyUnicode_Kind kind;
8966 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008967
8968 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 PyErr_BadArgument();
8970 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008971 }
8972
Victor Stinner42bf7752011-11-21 22:52:58 +01008973 unicode = PyUnicode_FromUnicode(s, length);
8974 if (unicode == NULL)
8975 return -1;
8976
Victor Stinner6345be92011-11-25 20:09:01 +01008977 if (PyUnicode_READY(unicode) < 0) {
8978 Py_DECREF(unicode);
8979 return -1;
8980 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008981 kind = PyUnicode_KIND(unicode);
8982 data = PyUnicode_DATA(unicode);
8983
Victor Stinnerb84d7232011-11-22 01:50:07 +01008984 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008985 PyObject *exc;
8986 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008988 Py_ssize_t startpos;
8989
8990 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008991
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008993 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008994 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008996 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 decimal = Py_UNICODE_TODECIMAL(ch);
8998 if (decimal >= 0) {
8999 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009000 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 continue;
9002 }
9003 if (0 < ch && ch < 256) {
9004 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009005 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 continue;
9007 }
Victor Stinner6345be92011-11-25 20:09:01 +01009008
Victor Stinner42bf7752011-11-21 22:52:58 +01009009 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009010 exc = NULL;
9011 raise_encode_exception(&exc, "decimal", unicode,
9012 startpos, startpos+1,
9013 "invalid decimal Unicode string");
9014 Py_XDECREF(exc);
9015 Py_DECREF(unicode);
9016 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009017 }
9018 /* 0-terminate the output string */
9019 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009020 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009021 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009022}
9023
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024/* --- Helpers ------------------------------------------------------------ */
9025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009027any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 Py_ssize_t start,
9029 Py_ssize_t end)
9030{
9031 int kind1, kind2, kind;
9032 void *buf1, *buf2;
9033 Py_ssize_t len1, len2, result;
9034
9035 kind1 = PyUnicode_KIND(s1);
9036 kind2 = PyUnicode_KIND(s2);
9037 kind = kind1 > kind2 ? kind1 : kind2;
9038 buf1 = PyUnicode_DATA(s1);
9039 buf2 = PyUnicode_DATA(s2);
9040 if (kind1 != kind)
9041 buf1 = _PyUnicode_AsKind(s1, kind);
9042 if (!buf1)
9043 return -2;
9044 if (kind2 != kind)
9045 buf2 = _PyUnicode_AsKind(s2, kind);
9046 if (!buf2) {
9047 if (kind1 != kind) PyMem_Free(buf1);
9048 return -2;
9049 }
9050 len1 = PyUnicode_GET_LENGTH(s1);
9051 len2 = PyUnicode_GET_LENGTH(s2);
9052
Victor Stinner794d5672011-10-10 03:21:36 +02009053 if (direction > 0) {
9054 switch(kind) {
9055 case PyUnicode_1BYTE_KIND:
9056 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9057 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9058 else
9059 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9060 break;
9061 case PyUnicode_2BYTE_KIND:
9062 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9063 break;
9064 case PyUnicode_4BYTE_KIND:
9065 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9066 break;
9067 default:
9068 assert(0); result = -2;
9069 }
9070 }
9071 else {
9072 switch(kind) {
9073 case PyUnicode_1BYTE_KIND:
9074 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9075 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9076 else
9077 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9078 break;
9079 case PyUnicode_2BYTE_KIND:
9080 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9081 break;
9082 case PyUnicode_4BYTE_KIND:
9083 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9084 break;
9085 default:
9086 assert(0); result = -2;
9087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 }
9089
9090 if (kind1 != kind)
9091 PyMem_Free(buf1);
9092 if (kind2 != kind)
9093 PyMem_Free(buf2);
9094
9095 return result;
9096}
9097
9098Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009099_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 Py_ssize_t n_buffer,
9101 void *digits, Py_ssize_t n_digits,
9102 Py_ssize_t min_width,
9103 const char *grouping,
9104 const char *thousands_sep)
9105{
9106 switch(kind) {
9107 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009108 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9109 return _PyUnicode_ascii_InsertThousandsGrouping(
9110 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9111 min_width, grouping, thousands_sep);
9112 else
9113 return _PyUnicode_ucs1_InsertThousandsGrouping(
9114 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9115 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 case PyUnicode_2BYTE_KIND:
9117 return _PyUnicode_ucs2_InsertThousandsGrouping(
9118 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9119 min_width, grouping, thousands_sep);
9120 case PyUnicode_4BYTE_KIND:
9121 return _PyUnicode_ucs4_InsertThousandsGrouping(
9122 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9123 min_width, grouping, thousands_sep);
9124 }
9125 assert(0);
9126 return -1;
9127}
9128
9129
/* Helper macro to fix up start/end slice values against a length `len`.
   `end` is clamped to `len`; a negative `start` or `end` is counted from
   the end and floored at 0.  `start` is not clamped to `len`, so it may
   still exceed `end` afterwards.
   `start` and `end` must be modifiable lvalues; `len` may be evaluated
   more than once.  Wrapped in do { } while (0) so the expansion is a
   single statement that is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009144
Alexander Belopolsky40018472011-02-26 01:02:56 +00009145Py_ssize_t
9146PyUnicode_Count(PyObject *str,
9147 PyObject *substr,
9148 Py_ssize_t start,
9149 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009151 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009152 PyObject* str_obj;
9153 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 int kind1, kind2, kind;
9155 void *buf1 = NULL, *buf2 = NULL;
9156 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009157
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009158 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009161 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009162 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 Py_DECREF(str_obj);
9164 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165 }
Tim Petersced69f82003-09-16 20:30:58 +00009166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 kind1 = PyUnicode_KIND(str_obj);
9168 kind2 = PyUnicode_KIND(sub_obj);
9169 kind = kind1 > kind2 ? kind1 : kind2;
9170 buf1 = PyUnicode_DATA(str_obj);
9171 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009172 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 if (!buf1)
9174 goto onError;
9175 buf2 = PyUnicode_DATA(sub_obj);
9176 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009177 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 if (!buf2)
9179 goto onError;
9180 len1 = PyUnicode_GET_LENGTH(str_obj);
9181 len2 = PyUnicode_GET_LENGTH(sub_obj);
9182
9183 ADJUST_INDICES(start, end, len1);
9184 switch(kind) {
9185 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009186 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9187 result = asciilib_count(
9188 ((Py_UCS1*)buf1) + start, end - start,
9189 buf2, len2, PY_SSIZE_T_MAX
9190 );
9191 else
9192 result = ucs1lib_count(
9193 ((Py_UCS1*)buf1) + start, end - start,
9194 buf2, len2, PY_SSIZE_T_MAX
9195 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 break;
9197 case PyUnicode_2BYTE_KIND:
9198 result = ucs2lib_count(
9199 ((Py_UCS2*)buf1) + start, end - start,
9200 buf2, len2, PY_SSIZE_T_MAX
9201 );
9202 break;
9203 case PyUnicode_4BYTE_KIND:
9204 result = ucs4lib_count(
9205 ((Py_UCS4*)buf1) + start, end - start,
9206 buf2, len2, PY_SSIZE_T_MAX
9207 );
9208 break;
9209 default:
9210 assert(0); result = 0;
9211 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009212
9213 Py_DECREF(sub_obj);
9214 Py_DECREF(str_obj);
9215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 if (kind1 != kind)
9217 PyMem_Free(buf1);
9218 if (kind2 != kind)
9219 PyMem_Free(buf2);
9220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 onError:
9223 Py_DECREF(sub_obj);
9224 Py_DECREF(str_obj);
9225 if (kind1 != kind && buf1)
9226 PyMem_Free(buf1);
9227 if (kind2 != kind && buf2)
9228 PyMem_Free(buf2);
9229 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230}
9231
Alexander Belopolsky40018472011-02-26 01:02:56 +00009232Py_ssize_t
9233PyUnicode_Find(PyObject *str,
9234 PyObject *sub,
9235 Py_ssize_t start,
9236 Py_ssize_t end,
9237 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009239 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009240
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009244 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 Py_DECREF(str);
9247 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248 }
Tim Petersced69f82003-09-16 20:30:58 +00009249
Victor Stinner794d5672011-10-10 03:21:36 +02009250 result = any_find_slice(direction,
9251 str, sub, start, end
9252 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009253
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255 Py_DECREF(sub);
9256
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257 return result;
9258}
9259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260Py_ssize_t
9261PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9262 Py_ssize_t start, Py_ssize_t end,
9263 int direction)
9264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009266 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 if (PyUnicode_READY(str) == -1)
9268 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009269 if (start < 0 || end < 0) {
9270 PyErr_SetString(PyExc_IndexError, "string index out of range");
9271 return -2;
9272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 if (end > PyUnicode_GET_LENGTH(str))
9274 end = PyUnicode_GET_LENGTH(str);
9275 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009276 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9277 kind, end-start, ch, direction);
9278 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009280 else
9281 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282}
9283
Alexander Belopolsky40018472011-02-26 01:02:56 +00009284static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009285tailmatch(PyObject *self,
9286 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009287 Py_ssize_t start,
9288 Py_ssize_t end,
9289 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 int kind_self;
9292 int kind_sub;
9293 void *data_self;
9294 void *data_sub;
9295 Py_ssize_t offset;
9296 Py_ssize_t i;
9297 Py_ssize_t end_sub;
9298
9299 if (PyUnicode_READY(self) == -1 ||
9300 PyUnicode_READY(substring) == -1)
9301 return 0;
9302
9303 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 return 1;
9305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9307 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009309 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 kind_self = PyUnicode_KIND(self);
9312 data_self = PyUnicode_DATA(self);
9313 kind_sub = PyUnicode_KIND(substring);
9314 data_sub = PyUnicode_DATA(substring);
9315 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9316
9317 if (direction > 0)
9318 offset = end;
9319 else
9320 offset = start;
9321
9322 if (PyUnicode_READ(kind_self, data_self, offset) ==
9323 PyUnicode_READ(kind_sub, data_sub, 0) &&
9324 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9325 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9326 /* If both are of the same kind, memcmp is sufficient */
9327 if (kind_self == kind_sub) {
9328 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009329 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 data_sub,
9331 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009332 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 }
9334 /* otherwise we have to compare each character by first accesing it */
9335 else {
9336 /* We do not need to compare 0 and len(substring)-1 because
9337 the if statement above ensured already that they are equal
9338 when we end up here. */
9339 // TODO: honor direction and do a forward or backwards search
9340 for (i = 1; i < end_sub; ++i) {
9341 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9342 PyUnicode_READ(kind_sub, data_sub, i))
9343 return 0;
9344 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 }
9348
9349 return 0;
9350}
9351
Alexander Belopolsky40018472011-02-26 01:02:56 +00009352Py_ssize_t
9353PyUnicode_Tailmatch(PyObject *str,
9354 PyObject *substr,
9355 Py_ssize_t start,
9356 Py_ssize_t end,
9357 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009359 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009360
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 str = PyUnicode_FromObject(str);
9362 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 substr = PyUnicode_FromObject(substr);
9365 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 Py_DECREF(str);
9367 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 }
Tim Petersced69f82003-09-16 20:30:58 +00009369
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009370 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 Py_DECREF(str);
9373 Py_DECREF(substr);
9374 return result;
9375}
9376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377/* Apply fixfct filter to the Unicode object self and return a
9378 reference to the modified object */
9379
Alexander Belopolsky40018472011-02-26 01:02:56 +00009380static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009381fixup(PyObject *self,
9382 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 PyObject *u;
9385 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009386 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009388 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009391 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 /* fix functions return the new maximum character in a string,
9394 if the kind of the resulting unicode object does not change,
9395 everything is fine. Otherwise we need to change the string kind
9396 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009397 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009398
9399 if (maxchar_new == 0) {
9400 /* no changes */;
9401 if (PyUnicode_CheckExact(self)) {
9402 Py_DECREF(u);
9403 Py_INCREF(self);
9404 return self;
9405 }
9406 else
9407 return u;
9408 }
9409
9410 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 maxchar_new = 127;
9412 else if (maxchar_new <= 255)
9413 maxchar_new = 255;
9414 else if (maxchar_new <= 65535)
9415 maxchar_new = 65535;
9416 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009417 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418
Victor Stinnereaab6042011-12-11 22:22:39 +01009419 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009421
9422 /* In case the maximum character changed, we need to
9423 convert the string to the new category. */
9424 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9425 if (v == NULL) {
9426 Py_DECREF(u);
9427 return NULL;
9428 }
9429 if (maxchar_new > maxchar_old) {
9430 /* If the maxchar increased so that the kind changed, not all
9431 characters are representable anymore and we need to fix the
9432 string again. This only happens in very few cases. */
9433 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9434 maxchar_old = fixfct(v);
9435 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 }
9437 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009438 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009440 Py_DECREF(u);
9441 assert(_PyUnicode_CheckConsistency(v, 1));
9442 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443}
9444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009446fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 /* No need to call PyUnicode_READY(self) because this function is only
9449 called as a callback from fixup() which does it already. */
9450 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9451 const int kind = PyUnicode_KIND(self);
9452 void *data = PyUnicode_DATA(self);
9453 int touched = 0;
9454 Py_UCS4 maxchar = 0;
9455 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 for (i = 0; i < len; ++i) {
9458 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9459 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9460 if (up != ch) {
9461 if (up > maxchar)
9462 maxchar = up;
9463 PyUnicode_WRITE(kind, data, i, up);
9464 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 else if (ch > maxchar)
9467 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468 }
9469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 if (touched)
9471 return maxchar;
9472 else
9473 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009477fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9480 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9481 const int kind = PyUnicode_KIND(self);
9482 void *data = PyUnicode_DATA(self);
9483 int touched = 0;
9484 Py_UCS4 maxchar = 0;
9485 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 for(i = 0; i < len; ++i) {
9488 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9489 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9490 if (lo != ch) {
9491 if (lo > maxchar)
9492 maxchar = lo;
9493 PyUnicode_WRITE(kind, data, i, lo);
9494 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 else if (ch > maxchar)
9497 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 }
9499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 if (touched)
9501 return maxchar;
9502 else
9503 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504}
9505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009507fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9510 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9511 const int kind = PyUnicode_KIND(self);
9512 void *data = PyUnicode_DATA(self);
9513 int touched = 0;
9514 Py_UCS4 maxchar = 0;
9515 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 for(i = 0; i < len; ++i) {
9518 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9519 Py_UCS4 nu = 0;
9520
9521 if (Py_UNICODE_ISUPPER(ch))
9522 nu = Py_UNICODE_TOLOWER(ch);
9523 else if (Py_UNICODE_ISLOWER(ch))
9524 nu = Py_UNICODE_TOUPPER(ch);
9525
9526 if (nu != 0) {
9527 if (nu > maxchar)
9528 maxchar = nu;
9529 PyUnicode_WRITE(kind, data, i, nu);
9530 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 else if (ch > maxchar)
9533 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 }
9535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 if (touched)
9537 return maxchar;
9538 else
9539 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540}
9541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009543fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9546 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9547 const int kind = PyUnicode_KIND(self);
9548 void *data = PyUnicode_DATA(self);
9549 int touched = 0;
9550 Py_UCS4 maxchar = 0;
9551 Py_ssize_t i = 0;
9552 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009553
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009554 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556
9557 ch = PyUnicode_READ(kind, data, i);
9558 if (!Py_UNICODE_ISUPPER(ch)) {
9559 maxchar = Py_UNICODE_TOUPPER(ch);
9560 PyUnicode_WRITE(kind, data, i, maxchar);
9561 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 ++i;
9564 for(; i < len; ++i) {
9565 ch = PyUnicode_READ(kind, data, i);
9566 if (!Py_UNICODE_ISLOWER(ch)) {
9567 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9568 if (lo > maxchar)
9569 maxchar = lo;
9570 PyUnicode_WRITE(kind, data, i, lo);
9571 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 else if (ch > maxchar)
9574 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576
9577 if (touched)
9578 return maxchar;
9579 else
9580 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581}
9582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009584fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9587 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9588 const int kind = PyUnicode_KIND(self);
9589 void *data = PyUnicode_DATA(self);
9590 Py_UCS4 maxchar = 0;
9591 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009592 int previous_is_cased;
9593
9594 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 if (len == 1) {
9596 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9597 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9598 if (ti != ch) {
9599 PyUnicode_WRITE(kind, data, i, ti);
9600 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 }
9602 else
9603 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 for(; i < len; ++i) {
9607 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9608 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009609
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 nu = Py_UNICODE_TOTITLE(ch);
9614
9615 if (nu > maxchar)
9616 maxchar = nu;
9617 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009618
Benjamin Peterson29060642009-01-31 22:14:21 +00009619 if (Py_UNICODE_ISLOWER(ch) ||
9620 Py_UNICODE_ISUPPER(ch) ||
9621 Py_UNICODE_ISTITLE(ch))
9622 previous_is_cased = 1;
9623 else
9624 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627}
9628
Tim Peters8ce9f162004-08-27 01:49:32 +00009629PyObject *
9630PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009633 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009636 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9637 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009638 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009640 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009642 int use_memcpy;
9643 unsigned char *res_data = NULL, *sep_data = NULL;
9644 PyObject *last_obj;
9645 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 fseq = PySequence_Fast(seq, "");
9648 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009649 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009650 }
9651
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009652 /* NOTE: the following code can't call back into Python code,
9653 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009654 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009655
Tim Peters05eba1f2004-08-27 21:32:02 +00009656 seqlen = PySequence_Fast_GET_SIZE(fseq);
9657 /* If empty sequence, return u"". */
9658 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009659 Py_DECREF(fseq);
9660 Py_INCREF(unicode_empty);
9661 res = unicode_empty;
9662 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009663 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009664
Tim Peters05eba1f2004-08-27 21:32:02 +00009665 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009666 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009667 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009668 if (seqlen == 1) {
9669 if (PyUnicode_CheckExact(items[0])) {
9670 res = items[0];
9671 Py_INCREF(res);
9672 Py_DECREF(fseq);
9673 return res;
9674 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009675 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009676 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009677 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009678 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009679 /* Set up sep and seplen */
9680 if (separator == NULL) {
9681 /* fall back to a blank space separator */
9682 sep = PyUnicode_FromOrdinal(' ');
9683 if (!sep)
9684 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009685 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009686 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009687 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009688 else {
9689 if (!PyUnicode_Check(separator)) {
9690 PyErr_Format(PyExc_TypeError,
9691 "separator: expected str instance,"
9692 " %.80s found",
9693 Py_TYPE(separator)->tp_name);
9694 goto onError;
9695 }
9696 if (PyUnicode_READY(separator))
9697 goto onError;
9698 sep = separator;
9699 seplen = PyUnicode_GET_LENGTH(separator);
9700 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9701 /* inc refcount to keep this code path symmetric with the
9702 above case of a blank separator */
9703 Py_INCREF(sep);
9704 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009705 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009706 }
9707
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009708 /* There are at least two things to join, or else we have a subclass
9709 * of str in the sequence.
9710 * Do a pre-pass to figure out the total amount of space we'll
9711 * need (sz), and see whether all argument are strings.
9712 */
9713 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009714#ifdef Py_DEBUG
9715 use_memcpy = 0;
9716#else
9717 use_memcpy = 1;
9718#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 for (i = 0; i < seqlen; i++) {
9720 const Py_ssize_t old_sz = sz;
9721 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009722 if (!PyUnicode_Check(item)) {
9723 PyErr_Format(PyExc_TypeError,
9724 "sequence item %zd: expected str instance,"
9725 " %.80s found",
9726 i, Py_TYPE(item)->tp_name);
9727 goto onError;
9728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 if (PyUnicode_READY(item) == -1)
9730 goto onError;
9731 sz += PyUnicode_GET_LENGTH(item);
9732 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009733 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009734 if (i != 0)
9735 sz += seplen;
9736 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9737 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009739 goto onError;
9740 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009741 if (use_memcpy && last_obj != NULL) {
9742 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9743 use_memcpy = 0;
9744 }
9745 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009746 }
Tim Petersced69f82003-09-16 20:30:58 +00009747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009749 if (res == NULL)
9750 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009751
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009752 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009753#ifdef Py_DEBUG
9754 use_memcpy = 0;
9755#else
9756 if (use_memcpy) {
9757 res_data = PyUnicode_1BYTE_DATA(res);
9758 kind = PyUnicode_KIND(res);
9759 if (seplen != 0)
9760 sep_data = PyUnicode_1BYTE_DATA(sep);
9761 }
9762#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009764 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009765 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009766 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009767 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009768 if (use_memcpy) {
9769 Py_MEMCPY(res_data,
9770 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009771 kind * seplen);
9772 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009773 }
9774 else {
9775 copy_characters(res, res_offset, sep, 0, seplen);
9776 res_offset += seplen;
9777 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009778 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009779 itemlen = PyUnicode_GET_LENGTH(item);
9780 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009781 if (use_memcpy) {
9782 Py_MEMCPY(res_data,
9783 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009784 kind * itemlen);
9785 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009786 }
9787 else {
9788 copy_characters(res, res_offset, item, 0, itemlen);
9789 res_offset += itemlen;
9790 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009791 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009792 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009793 if (use_memcpy)
9794 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009795 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009796 else
9797 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009798
Tim Peters05eba1f2004-08-27 21:32:02 +00009799 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009801 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803
Benjamin Peterson29060642009-01-31 22:14:21 +00009804 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009805 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009807 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 return NULL;
9809}
9810
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the character buffer `data`, beginning at
   character index `start`.  `kind` selects the element type used to
   address the buffer: unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2 for
   PyUnicode_2BYTE_KIND and Py_UCS4 otherwise.

   All arguments are parenthesized in the expansion; `value` and `length`
   are evaluated exactly once. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        } \
    } while (0)
9833
Victor Stinner9310abb2011-10-05 00:59:23 +02009834static PyObject *
9835pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009836 Py_ssize_t left,
9837 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 PyObject *u;
9841 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009842 int kind;
9843 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844
9845 if (left < 0)
9846 left = 0;
9847 if (right < 0)
9848 right = 0;
9849
Victor Stinnerc4b49542011-12-11 22:44:26 +01009850 if (left == 0 && right == 0)
9851 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9854 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009855 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9856 return NULL;
9857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9859 if (fill > maxchar)
9860 maxchar = fill;
9861 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009862 if (!u)
9863 return NULL;
9864
9865 kind = PyUnicode_KIND(u);
9866 data = PyUnicode_DATA(u);
9867 if (left)
9868 FILL(kind, data, fill, 0, left);
9869 if (right)
9870 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009871 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009872 assert(_PyUnicode_CheckConsistency(u, 1));
9873 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876
Alexander Belopolsky40018472011-02-26 01:02:56 +00009877PyObject *
9878PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881
9882 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 switch(PyUnicode_KIND(string)) {
9887 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009888 if (PyUnicode_IS_ASCII(string))
9889 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009890 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009891 PyUnicode_GET_LENGTH(string), keepends);
9892 else
9893 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009894 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009895 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 break;
9897 case PyUnicode_2BYTE_KIND:
9898 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009899 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 PyUnicode_GET_LENGTH(string), keepends);
9901 break;
9902 case PyUnicode_4BYTE_KIND:
9903 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009904 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 PyUnicode_GET_LENGTH(string), keepends);
9906 break;
9907 default:
9908 assert(0);
9909 list = 0;
9910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 Py_DECREF(string);
9912 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913}
9914
Alexander Belopolsky40018472011-02-26 01:02:56 +00009915static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009916split(PyObject *self,
9917 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009918 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 int kind1, kind2, kind;
9921 void *buf1, *buf2;
9922 Py_ssize_t len1, len2;
9923 PyObject* out;
9924
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009926 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 if (PyUnicode_READY(self) == -1)
9929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 if (substring == NULL)
9932 switch(PyUnicode_KIND(self)) {
9933 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009934 if (PyUnicode_IS_ASCII(self))
9935 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009936 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009937 PyUnicode_GET_LENGTH(self), maxcount
9938 );
9939 else
9940 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009941 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009942 PyUnicode_GET_LENGTH(self), maxcount
9943 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 case PyUnicode_2BYTE_KIND:
9945 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009946 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 PyUnicode_GET_LENGTH(self), maxcount
9948 );
9949 case PyUnicode_4BYTE_KIND:
9950 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009951 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 PyUnicode_GET_LENGTH(self), maxcount
9953 );
9954 default:
9955 assert(0);
9956 return NULL;
9957 }
9958
9959 if (PyUnicode_READY(substring) == -1)
9960 return NULL;
9961
9962 kind1 = PyUnicode_KIND(self);
9963 kind2 = PyUnicode_KIND(substring);
9964 kind = kind1 > kind2 ? kind1 : kind2;
9965 buf1 = PyUnicode_DATA(self);
9966 buf2 = PyUnicode_DATA(substring);
9967 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009968 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 if (!buf1)
9970 return NULL;
9971 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009972 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 if (!buf2) {
9974 if (kind1 != kind) PyMem_Free(buf1);
9975 return NULL;
9976 }
9977 len1 = PyUnicode_GET_LENGTH(self);
9978 len2 = PyUnicode_GET_LENGTH(substring);
9979
9980 switch(kind) {
9981 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9983 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009984 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009985 else
9986 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009987 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 break;
9989 case PyUnicode_2BYTE_KIND:
9990 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009991 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 break;
9993 case PyUnicode_4BYTE_KIND:
9994 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009995 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 break;
9997 default:
9998 out = NULL;
9999 }
10000 if (kind1 != kind)
10001 PyMem_Free(buf1);
10002 if (kind2 != kind)
10003 PyMem_Free(buf2);
10004 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005}
10006
Alexander Belopolsky40018472011-02-26 01:02:56 +000010007static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010008rsplit(PyObject *self,
10009 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010010 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 int kind1, kind2, kind;
10013 void *buf1, *buf2;
10014 Py_ssize_t len1, len2;
10015 PyObject* out;
10016
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010017 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010018 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (PyUnicode_READY(self) == -1)
10021 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 if (substring == NULL)
10024 switch(PyUnicode_KIND(self)) {
10025 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010026 if (PyUnicode_IS_ASCII(self))
10027 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010028 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010029 PyUnicode_GET_LENGTH(self), maxcount
10030 );
10031 else
10032 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010033 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010034 PyUnicode_GET_LENGTH(self), maxcount
10035 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 case PyUnicode_2BYTE_KIND:
10037 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010038 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 PyUnicode_GET_LENGTH(self), maxcount
10040 );
10041 case PyUnicode_4BYTE_KIND:
10042 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010043 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 PyUnicode_GET_LENGTH(self), maxcount
10045 );
10046 default:
10047 assert(0);
10048 return NULL;
10049 }
10050
10051 if (PyUnicode_READY(substring) == -1)
10052 return NULL;
10053
10054 kind1 = PyUnicode_KIND(self);
10055 kind2 = PyUnicode_KIND(substring);
10056 kind = kind1 > kind2 ? kind1 : kind2;
10057 buf1 = PyUnicode_DATA(self);
10058 buf2 = PyUnicode_DATA(substring);
10059 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010060 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (!buf1)
10062 return NULL;
10063 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010064 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 if (!buf2) {
10066 if (kind1 != kind) PyMem_Free(buf1);
10067 return NULL;
10068 }
10069 len1 = PyUnicode_GET_LENGTH(self);
10070 len2 = PyUnicode_GET_LENGTH(substring);
10071
10072 switch(kind) {
10073 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010074 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10075 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010076 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010077 else
10078 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010079 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 break;
10081 case PyUnicode_2BYTE_KIND:
10082 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010083 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 break;
10085 case PyUnicode_4BYTE_KIND:
10086 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010087 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 break;
10089 default:
10090 out = NULL;
10091 }
10092 if (kind1 != kind)
10093 PyMem_Free(buf1);
10094 if (kind2 != kind)
10095 PyMem_Free(buf2);
10096 return out;
10097}
10098
10099static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010100anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10101 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102{
10103 switch(kind) {
10104 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10106 return asciilib_find(buf1, len1, buf2, len2, offset);
10107 else
10108 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 case PyUnicode_2BYTE_KIND:
10110 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10111 case PyUnicode_4BYTE_KIND:
10112 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10113 }
10114 assert(0);
10115 return -1;
10116}
10117
10118static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10120 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121{
10122 switch(kind) {
10123 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010124 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10125 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10126 else
10127 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 case PyUnicode_2BYTE_KIND:
10129 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10130 case PyUnicode_4BYTE_KIND:
10131 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10132 }
10133 assert(0);
10134 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010135}
10136
Alexander Belopolsky40018472011-02-26 01:02:56 +000010137static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138replace(PyObject *self, PyObject *str1,
10139 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 PyObject *u;
10142 char *sbuf = PyUnicode_DATA(self);
10143 char *buf1 = PyUnicode_DATA(str1);
10144 char *buf2 = PyUnicode_DATA(str2);
10145 int srelease = 0, release1 = 0, release2 = 0;
10146 int skind = PyUnicode_KIND(self);
10147 int kind1 = PyUnicode_KIND(str1);
10148 int kind2 = PyUnicode_KIND(str2);
10149 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10150 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10151 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 int mayshrink;
10153 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154
10155 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010158 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
Victor Stinner59de0ee2011-10-07 10:01:28 +020010160 if (str1 == str2)
10161 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 if (skind < kind1)
10163 /* substring too wide to be present */
10164 goto nothing;
10165
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10167 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10168 /* Replacing str1 with str2 may cause a maxchar reduction in the
10169 result string. */
10170 mayshrink = (maxchar_str2 < maxchar);
10171 maxchar = Py_MAX(maxchar, maxchar_str2);
10172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010174 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010175 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010177 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010180 Py_UCS4 u1, u2;
10181 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010183 if (findchar(sbuf, PyUnicode_KIND(self),
10184 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010188 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010190 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 rkind = PyUnicode_KIND(u);
10192 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10193 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010194 if (--maxcount < 0)
10195 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010197 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010198 }
10199 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 int rkind = skind;
10201 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 if (kind1 < rkind) {
10204 /* widen substring */
10205 buf1 = _PyUnicode_AsKind(str1, rkind);
10206 if (!buf1) goto error;
10207 release1 = 1;
10208 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010210 if (i < 0)
10211 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (rkind > kind2) {
10213 /* widen replacement */
10214 buf2 = _PyUnicode_AsKind(str2, rkind);
10215 if (!buf2) goto error;
10216 release2 = 1;
10217 }
10218 else if (rkind < kind2) {
10219 /* widen self and buf1 */
10220 rkind = kind2;
10221 if (release1) PyMem_Free(buf1);
10222 sbuf = _PyUnicode_AsKind(self, rkind);
10223 if (!sbuf) goto error;
10224 srelease = 1;
10225 buf1 = _PyUnicode_AsKind(str1, rkind);
10226 if (!buf1) goto error;
10227 release1 = 1;
10228 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010229 u = PyUnicode_New(slen, maxchar);
10230 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010232 assert(PyUnicode_KIND(u) == rkind);
10233 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010234
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010235 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010236 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010239 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010241
10242 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010243 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010245 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010246 if (i == -1)
10247 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010254 }
10255 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 Py_ssize_t n, i, j, ires;
10257 Py_ssize_t product, new_size;
10258 int rkind = skind;
10259 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010262 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 buf1 = _PyUnicode_AsKind(str1, rkind);
10264 if (!buf1) goto error;
10265 release1 = 1;
10266 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010267 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010268 if (n == 0)
10269 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010271 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 buf2 = _PyUnicode_AsKind(str2, rkind);
10273 if (!buf2) goto error;
10274 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010277 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 rkind = kind2;
10279 sbuf = _PyUnicode_AsKind(self, rkind);
10280 if (!sbuf) goto error;
10281 srelease = 1;
10282 if (release1) PyMem_Free(buf1);
10283 buf1 = _PyUnicode_AsKind(str1, rkind);
10284 if (!buf1) goto error;
10285 release1 = 1;
10286 }
10287 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10288 PyUnicode_GET_LENGTH(str1))); */
10289 product = n * (len2-len1);
10290 if ((product / (len2-len1)) != n) {
10291 PyErr_SetString(PyExc_OverflowError,
10292 "replace string is too long");
10293 goto error;
10294 }
10295 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 if (new_size == 0) {
10297 Py_INCREF(unicode_empty);
10298 u = unicode_empty;
10299 goto done;
10300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10302 PyErr_SetString(PyExc_OverflowError,
10303 "replace string is too long");
10304 goto error;
10305 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010306 u = PyUnicode_New(new_size, maxchar);
10307 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010309 assert(PyUnicode_KIND(u) == rkind);
10310 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 ires = i = 0;
10312 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 while (n-- > 0) {
10314 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010315 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010316 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010317 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010318 if (j == -1)
10319 break;
10320 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010321 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010322 memcpy(res + rkind * ires,
10323 sbuf + rkind * i,
10324 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010326 }
10327 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010329 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 memcpy(res + rkind * ires,
10339 sbuf + rkind * i,
10340 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010341 }
10342 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343 /* interleave */
10344 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010345 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010347 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010349 if (--n <= 0)
10350 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res + rkind * ires,
10352 sbuf + rkind * i,
10353 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 ires++;
10355 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010356 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 memcpy(res + rkind * ires,
10358 sbuf + rkind * i,
10359 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010360 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010361 }
10362
10363 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010364 unicode_adjust_maxchar(&u);
10365 if (u == NULL)
10366 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010368
10369 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (srelease)
10371 PyMem_FREE(sbuf);
10372 if (release1)
10373 PyMem_FREE(buf1);
10374 if (release2)
10375 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010376 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010378
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010380 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 if (srelease)
10382 PyMem_FREE(sbuf);
10383 if (release1)
10384 PyMem_FREE(buf1);
10385 if (release2)
10386 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010387 return unicode_result_unchanged(self);
10388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 error:
10390 if (srelease && sbuf)
10391 PyMem_FREE(sbuf);
10392 if (release1 && buf1)
10393 PyMem_FREE(buf1);
10394 if (release2 && buf2)
10395 PyMem_FREE(buf2);
10396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
10399/* --- Unicode Object Methods --------------------------------------------- */
10400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010401PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010402 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403\n\
10404Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010405characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
10407static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010408unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410 return fixup(self, fixtitle);
10411}
10412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010413PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415\n\
10416Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010417have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
10419static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010420unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 return fixup(self, fixcapitalize);
10423}
10424
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   run fixup(..., fixcapitalize) on every word and join the words again.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup(old_word, fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10462
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010463/* Argument converter. Coerces to a single unicode character */
10464
10465static int
10466convert_uc(PyObject *obj, void *addr)
10467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010469 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010470
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 uniobj = PyUnicode_FromObject(obj);
10472 if (uniobj == NULL) {
10473 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010474 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010475 return 0;
10476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010478 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010480 Py_DECREF(uniobj);
10481 return 0;
10482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010484 Py_DECREF(uniobj);
10485 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010486}
10487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010488PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010489 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010491Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010492done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493
10494static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010495unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010497 Py_ssize_t marg, left;
10498 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 Py_UCS4 fillchar = ' ';
10500
Victor Stinnere9a29352011-10-01 02:14:59 +020010501 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
Victor Stinnerc4b49542011-12-11 22:44:26 +010010504 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505 return NULL;
10506
Victor Stinnerc4b49542011-12-11 22:44:26 +010010507 if (PyUnicode_GET_LENGTH(self) >= width)
10508 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
Victor Stinnerc4b49542011-12-11 22:44:26 +010010510 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511 left = marg / 2 + (marg & width & 1);
10512
Victor Stinner9310abb2011-10-05 00:59:23 +020010513 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514}
10515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516/* This function assumes that str1 and str2 are readied by the caller. */
10517
Marc-André Lemburge5034372000-08-08 08:04:29 +000010518static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010519unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 int kind1, kind2;
10522 void *data1, *data2;
10523 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 kind1 = PyUnicode_KIND(str1);
10526 kind2 = PyUnicode_KIND(str2);
10527 data1 = PyUnicode_DATA(str1);
10528 data2 = PyUnicode_DATA(str2);
10529 len1 = PyUnicode_GET_LENGTH(str1);
10530 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 for (i = 0; i < len1 && i < len2; ++i) {
10533 Py_UCS4 c1, c2;
10534 c1 = PyUnicode_READ(kind1, data1, i);
10535 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010536
10537 if (c1 != c2)
10538 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010539 }
10540
10541 return (len1 < len2) ? -1 : (len1 != len2);
10542}
10543
Alexander Belopolsky40018472011-02-26 01:02:56 +000010544int
10545PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10548 if (PyUnicode_READY(left) == -1 ||
10549 PyUnicode_READY(right) == -1)
10550 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010551 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010553 PyErr_Format(PyExc_TypeError,
10554 "Can't compare %.100s and %.100s",
10555 left->ob_type->tp_name,
10556 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 return -1;
10558}
10559
Martin v. Löwis5b222132007-06-10 09:51:05 +000010560int
10561PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 Py_ssize_t i;
10564 int kind;
10565 void *data;
10566 Py_UCS4 chr;
10567
Victor Stinner910337b2011-10-03 03:20:16 +020010568 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (PyUnicode_READY(uni) == -1)
10570 return -1;
10571 kind = PyUnicode_KIND(uni);
10572 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010573 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10575 if (chr != str[i])
10576 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010577 /* This check keeps Python strings that end in '\0' from comparing equal
10578 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010581 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010583 return 0;
10584}
10585
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010586
Benjamin Peterson29060642009-01-31 22:14:21 +000010587#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010589
Alexander Belopolsky40018472011-02-26 01:02:56 +000010590PyObject *
10591PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010592{
10593 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010594
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010595 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10596 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (PyUnicode_READY(left) == -1 ||
10598 PyUnicode_READY(right) == -1)
10599 return NULL;
10600 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10601 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010602 if (op == Py_EQ) {
10603 Py_INCREF(Py_False);
10604 return Py_False;
10605 }
10606 if (op == Py_NE) {
10607 Py_INCREF(Py_True);
10608 return Py_True;
10609 }
10610 }
10611 if (left == right)
10612 result = 0;
10613 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010614 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010615
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010616 /* Convert the return value to a Boolean */
10617 switch (op) {
10618 case Py_EQ:
10619 v = TEST_COND(result == 0);
10620 break;
10621 case Py_NE:
10622 v = TEST_COND(result != 0);
10623 break;
10624 case Py_LE:
10625 v = TEST_COND(result <= 0);
10626 break;
10627 case Py_GE:
10628 v = TEST_COND(result >= 0);
10629 break;
10630 case Py_LT:
10631 v = TEST_COND(result == -1);
10632 break;
10633 case Py_GT:
10634 v = TEST_COND(result == 1);
10635 break;
10636 default:
10637 PyErr_BadArgument();
10638 return NULL;
10639 }
10640 Py_INCREF(v);
10641 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010643
Brian Curtindfc80e32011-08-10 20:28:54 -050010644 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010645}
10646
Alexander Belopolsky40018472011-02-26 01:02:56 +000010647int
10648PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010649{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 int kind1, kind2, kind;
10652 void *buf1, *buf2;
10653 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010654 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010655
10656 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 sub = PyUnicode_FromObject(element);
10658 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 PyErr_Format(PyExc_TypeError,
10660 "'in <string>' requires string as left operand, not %s",
10661 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (PyUnicode_READY(sub) == -1)
10665 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010666
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010668 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 Py_DECREF(sub);
10670 return -1;
10671 }
10672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 kind1 = PyUnicode_KIND(str);
10674 kind2 = PyUnicode_KIND(sub);
10675 kind = kind1 > kind2 ? kind1 : kind2;
10676 buf1 = PyUnicode_DATA(str);
10677 buf2 = PyUnicode_DATA(sub);
10678 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010679 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (!buf1) {
10681 Py_DECREF(sub);
10682 return -1;
10683 }
10684 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010685 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (!buf2) {
10687 Py_DECREF(sub);
10688 if (kind1 != kind) PyMem_Free(buf1);
10689 return -1;
10690 }
10691 len1 = PyUnicode_GET_LENGTH(str);
10692 len2 = PyUnicode_GET_LENGTH(sub);
10693
10694 switch(kind) {
10695 case PyUnicode_1BYTE_KIND:
10696 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10697 break;
10698 case PyUnicode_2BYTE_KIND:
10699 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10700 break;
10701 case PyUnicode_4BYTE_KIND:
10702 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10703 break;
10704 default:
10705 result = -1;
10706 assert(0);
10707 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708
10709 Py_DECREF(str);
10710 Py_DECREF(sub);
10711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 if (kind1 != kind)
10713 PyMem_Free(buf1);
10714 if (kind2 != kind)
10715 PyMem_Free(buf2);
10716
Guido van Rossum403d68b2000-03-13 15:55:09 +000010717 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010718}
10719
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720/* Concat to string or Unicode object giving a new Unicode object. */
10721
Alexander Belopolsky40018472011-02-26 01:02:56 +000010722PyObject *
10723PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010726 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010727 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728
10729 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010735 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736
10737 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010738 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010742 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 }
10746
Victor Stinner488fa492011-12-12 00:01:39 +010010747 u_len = PyUnicode_GET_LENGTH(u);
10748 v_len = PyUnicode_GET_LENGTH(v);
10749 if (u_len > PY_SSIZE_T_MAX - v_len) {
10750 PyErr_SetString(PyExc_OverflowError,
10751 "strings are too large to concat");
10752 goto onError;
10753 }
10754 new_len = u_len + v_len;
10755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010757 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10758 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010761 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010764 copy_characters(w, 0, u, 0, u_len);
10765 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 Py_DECREF(u);
10767 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010768 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Benjamin Peterson29060642009-01-31 22:14:21 +000010771 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 Py_XDECREF(u);
10773 Py_XDECREF(v);
10774 return NULL;
10775}
10776
Walter Dörwald1ab83302007-05-18 17:15:44 +000010777void
Victor Stinner23e56682011-10-03 03:54:37 +020010778PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010779{
Victor Stinner23e56682011-10-03 03:54:37 +020010780 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010781 Py_UCS4 maxchar, maxchar2;
10782 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010783
10784 if (p_left == NULL) {
10785 if (!PyErr_Occurred())
10786 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 return;
10788 }
Victor Stinner23e56682011-10-03 03:54:37 +020010789 left = *p_left;
10790 if (right == NULL || !PyUnicode_Check(left)) {
10791 if (!PyErr_Occurred())
10792 PyErr_BadInternalCall();
10793 goto error;
10794 }
10795
Victor Stinnere1335c72011-10-04 20:53:03 +020010796 if (PyUnicode_READY(left))
10797 goto error;
10798 if (PyUnicode_READY(right))
10799 goto error;
10800
Victor Stinner488fa492011-12-12 00:01:39 +010010801 /* Shortcuts */
10802 if (left == unicode_empty) {
10803 Py_DECREF(left);
10804 Py_INCREF(right);
10805 *p_left = right;
10806 return;
10807 }
10808 if (right == unicode_empty)
10809 return;
10810
10811 left_len = PyUnicode_GET_LENGTH(left);
10812 right_len = PyUnicode_GET_LENGTH(right);
10813 if (left_len > PY_SSIZE_T_MAX - right_len) {
10814 PyErr_SetString(PyExc_OverflowError,
10815 "strings are too large to concat");
10816 goto error;
10817 }
10818 new_len = left_len + right_len;
10819
10820 if (unicode_modifiable(left)
10821 && PyUnicode_CheckExact(right)
10822 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010823 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10824 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010825 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010826 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010827 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10828 {
10829 /* append inplace */
10830 if (unicode_resize(p_left, new_len) != 0) {
10831 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10832 * deallocated so it cannot be put back into
10833 * 'variable'. The MemoryError is raised when there
10834 * is no value in 'variable', which might (very
10835 * remotely) be a cause of incompatibilities.
10836 */
10837 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010838 }
Victor Stinner488fa492011-12-12 00:01:39 +010010839 /* copy 'right' into the newly allocated area of 'left' */
10840 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010841 }
Victor Stinner488fa492011-12-12 00:01:39 +010010842 else {
10843 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10844 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10845 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010846
Victor Stinner488fa492011-12-12 00:01:39 +010010847 /* Concat the two Unicode strings */
10848 res = PyUnicode_New(new_len, maxchar);
10849 if (res == NULL)
10850 goto error;
10851 copy_characters(res, 0, left, 0, left_len);
10852 copy_characters(res, left_len, right, 0, right_len);
10853 Py_DECREF(left);
10854 *p_left = res;
10855 }
10856 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010857 return;
10858
10859error:
Victor Stinner488fa492011-12-12 00:01:39 +010010860 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010861}
10862
10863void
10864PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10865{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010866 PyUnicode_Append(pleft, right);
10867 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010868}
10869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010870PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010873Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010874string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010875interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
10877static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010878unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010880 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010881 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010882 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 int kind1, kind2, kind;
10885 void *buf1, *buf2;
10886 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887
Jesus Ceaac451502011-04-20 17:09:23 +020010888 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10889 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 kind1 = PyUnicode_KIND(self);
10893 kind2 = PyUnicode_KIND(substring);
10894 kind = kind1 > kind2 ? kind1 : kind2;
10895 buf1 = PyUnicode_DATA(self);
10896 buf2 = PyUnicode_DATA(substring);
10897 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010898 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (!buf1) {
10900 Py_DECREF(substring);
10901 return NULL;
10902 }
10903 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010904 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 if (!buf2) {
10906 Py_DECREF(substring);
10907 if (kind1 != kind) PyMem_Free(buf1);
10908 return NULL;
10909 }
10910 len1 = PyUnicode_GET_LENGTH(self);
10911 len2 = PyUnicode_GET_LENGTH(substring);
10912
10913 ADJUST_INDICES(start, end, len1);
10914 switch(kind) {
10915 case PyUnicode_1BYTE_KIND:
10916 iresult = ucs1lib_count(
10917 ((Py_UCS1*)buf1) + start, end - start,
10918 buf2, len2, PY_SSIZE_T_MAX
10919 );
10920 break;
10921 case PyUnicode_2BYTE_KIND:
10922 iresult = ucs2lib_count(
10923 ((Py_UCS2*)buf1) + start, end - start,
10924 buf2, len2, PY_SSIZE_T_MAX
10925 );
10926 break;
10927 case PyUnicode_4BYTE_KIND:
10928 iresult = ucs4lib_count(
10929 ((Py_UCS4*)buf1) + start, end - start,
10930 buf2, len2, PY_SSIZE_T_MAX
10931 );
10932 break;
10933 default:
10934 assert(0); iresult = 0;
10935 }
10936
10937 result = PyLong_FromSsize_t(iresult);
10938
10939 if (kind1 != kind)
10940 PyMem_Free(buf1);
10941 if (kind2 != kind)
10942 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
10944 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946 return result;
10947}
10948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010949PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010950 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010952Encode S using the codec registered for encoding. Default encoding\n\
10953is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010954handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010955a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10956'xmlcharrefreplace' as well as any other name registered with\n\
10957codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010960unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010962 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 char *encoding = NULL;
10964 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010965
Benjamin Peterson308d6372009-09-18 21:42:35 +000010966 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10967 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010969 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010970}
10971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010972PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974\n\
10975Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010976If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
10978static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010979unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010981 Py_ssize_t i, j, line_pos, src_len, incr;
10982 Py_UCS4 ch;
10983 PyObject *u;
10984 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010986 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010987 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
Antoine Pitrou22425222011-10-04 19:10:51 +020010992 if (PyUnicode_READY(self) == -1)
10993 return NULL;
10994
Thomas Wouters7e474022000-07-16 12:04:32 +000010995 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 src_len = PyUnicode_GET_LENGTH(self);
10997 i = j = line_pos = 0;
10998 kind = PyUnicode_KIND(self);
10999 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011000 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011001 for (; i < src_len; i++) {
11002 ch = PyUnicode_READ(kind, src_data, i);
11003 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011004 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011008 goto overflow;
11009 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011011 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011015 goto overflow;
11016 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011018 if (ch == '\n' || ch == '\r')
11019 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011021 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011022 if (!found)
11023 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011024
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011026 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 if (!u)
11028 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011029 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030
Antoine Pitroue71d5742011-10-04 15:55:09 +020011031 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 for (; i < src_len; i++) {
11034 ch = PyUnicode_READ(kind, src_data, i);
11035 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011037 incr = tabsize - (line_pos % tabsize);
11038 line_pos += incr;
11039 while (incr--) {
11040 PyUnicode_WRITE(kind, dest_data, j, ' ');
11041 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011042 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011043 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011044 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011046 line_pos++;
11047 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011048 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011049 if (ch == '\n' || ch == '\r')
11050 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011052 }
11053 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011054 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011055
Antoine Pitroue71d5742011-10-04 15:55:09 +020011056 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011057 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059}
11060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063\n\
11064Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011065such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066arguments start and end are interpreted as in slice notation.\n\
11067\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011068Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011073 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011074 Py_ssize_t start;
11075 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011076 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Jesus Ceaac451502011-04-20 17:09:23 +020011078 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11079 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (PyUnicode_READY(self) == -1)
11083 return NULL;
11084 if (PyUnicode_READY(substring) == -1)
11085 return NULL;
11086
Victor Stinner7931d9a2011-11-04 00:22:48 +010011087 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (result == -2)
11092 return NULL;
11093
Christian Heimes217cfd12007-12-02 14:31:20 +000011094 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095}
11096
11097static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011098unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011100 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11101 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104}
11105
Guido van Rossumc2504932007-09-18 19:42:40 +000011106/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011107 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011108static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011109unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Guido van Rossumc2504932007-09-18 19:42:40 +000011111 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011112 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 if (_PyUnicode_HASH(self) != -1)
11115 return _PyUnicode_HASH(self);
11116 if (PyUnicode_READY(self) == -1)
11117 return -1;
11118 len = PyUnicode_GET_LENGTH(self);
11119
11120 /* The hash function as a macro, gets expanded three times below. */
11121#define HASH(P) \
11122 x = (Py_uhash_t)*P << 7; \
11123 while (--len >= 0) \
11124 x = (1000003*x) ^ (Py_uhash_t)*P++;
11125
11126 switch (PyUnicode_KIND(self)) {
11127 case PyUnicode_1BYTE_KIND: {
11128 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11129 HASH(c);
11130 break;
11131 }
11132 case PyUnicode_2BYTE_KIND: {
11133 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11134 HASH(s);
11135 break;
11136 }
11137 default: {
11138 Py_UCS4 *l;
11139 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11140 "Impossible switch case in unicode_hash");
11141 l = PyUnicode_4BYTE_DATA(self);
11142 HASH(l);
11143 break;
11144 }
11145 }
11146 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11147
Guido van Rossumc2504932007-09-18 19:42:40 +000011148 if (x == -1)
11149 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011151 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011155PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
11160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011163 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011164 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011165 Py_ssize_t start;
11166 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Jesus Ceaac451502011-04-20 17:09:23 +020011168 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11169 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (PyUnicode_READY(self) == -1)
11173 return NULL;
11174 if (PyUnicode_READY(substring) == -1)
11175 return NULL;
11176
Victor Stinner7931d9a2011-11-04 00:22:48 +010011177 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178
11179 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 if (result == -2)
11182 return NULL;
11183
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 if (result < 0) {
11185 PyErr_SetString(PyExc_ValueError, "substring not found");
11186 return NULL;
11187 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011188
Christian Heimes217cfd12007-12-02 14:31:20 +000011189 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190}
11191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011192PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011193 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011195Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011196at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011199unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 Py_ssize_t i, length;
11202 int kind;
11203 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 int cased;
11205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 if (PyUnicode_READY(self) == -1)
11207 return NULL;
11208 length = PyUnicode_GET_LENGTH(self);
11209 kind = PyUnicode_KIND(self);
11210 data = PyUnicode_DATA(self);
11211
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 if (length == 1)
11214 return PyBool_FromLong(
11215 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011217 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011219 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011220
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 for (i = 0; i < length; i++) {
11223 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011224
Benjamin Peterson29060642009-01-31 22:14:21 +000011225 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11226 return PyBool_FromLong(0);
11227 else if (!cased && Py_UNICODE_ISLOWER(ch))
11228 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011230 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231}
11232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011233PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011236Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011237at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
11239static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011240unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 Py_ssize_t i, length;
11243 int kind;
11244 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 int cased;
11246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (PyUnicode_READY(self) == -1)
11248 return NULL;
11249 length = PyUnicode_GET_LENGTH(self);
11250 kind = PyUnicode_KIND(self);
11251 data = PyUnicode_DATA(self);
11252
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 if (length == 1)
11255 return PyBool_FromLong(
11256 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011258 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011261
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 for (i = 0; i < length; i++) {
11264 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011265
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11267 return PyBool_FromLong(0);
11268 else if (!cased && Py_UNICODE_ISUPPER(ch))
11269 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011271 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272}
11273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011277Return True if S is a titlecased string and there is at least one\n\
11278character in S, i.e. upper- and titlecase characters may only\n\
11279follow uncased characters and lowercase characters only cased ones.\n\
11280Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011283unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t i, length;
11286 int kind;
11287 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 int cased, previous_is_cased;
11289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (PyUnicode_READY(self) == -1)
11291 return NULL;
11292 length = PyUnicode_GET_LENGTH(self);
11293 kind = PyUnicode_KIND(self);
11294 data = PyUnicode_DATA(self);
11295
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (length == 1) {
11298 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11299 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11300 (Py_UNICODE_ISUPPER(ch) != 0));
11301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011303 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011306
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 cased = 0;
11308 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 for (i = 0; i < length; i++) {
11310 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011311
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11313 if (previous_is_cased)
11314 return PyBool_FromLong(0);
11315 previous_is_cased = 1;
11316 cased = 1;
11317 }
11318 else if (Py_UNICODE_ISLOWER(ch)) {
11319 if (!previous_is_cased)
11320 return PyBool_FromLong(0);
11321 previous_is_cased = 1;
11322 cased = 1;
11323 }
11324 else
11325 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011327 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328}
11329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011333Return True if all characters in S are whitespace\n\
11334and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
11336static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011337unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 Py_ssize_t i, length;
11340 int kind;
11341 void *data;
11342
11343 if (PyUnicode_READY(self) == -1)
11344 return NULL;
11345 length = PyUnicode_GET_LENGTH(self);
11346 kind = PyUnicode_KIND(self);
11347 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 if (length == 1)
11351 return PyBool_FromLong(
11352 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011354 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 for (i = 0; i < length; i++) {
11359 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011360 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011363 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011369Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011370and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371
11372static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011373unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 Py_ssize_t i, length;
11376 int kind;
11377 void *data;
11378
11379 if (PyUnicode_READY(self) == -1)
11380 return NULL;
11381 length = PyUnicode_GET_LENGTH(self);
11382 kind = PyUnicode_KIND(self);
11383 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011384
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011385 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 if (length == 1)
11387 return PyBool_FromLong(
11388 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389
11390 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011392 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 for (i = 0; i < length; i++) {
11395 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011397 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011398 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011399}
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011403\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011404Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011405and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011406
11407static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011408unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 int kind;
11411 void *data;
11412 Py_ssize_t len, i;
11413
11414 if (PyUnicode_READY(self) == -1)
11415 return NULL;
11416
11417 kind = PyUnicode_KIND(self);
11418 data = PyUnicode_DATA(self);
11419 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011421 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (len == 1) {
11423 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11424 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11425 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011426
11427 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 for (i = 0; i < len; i++) {
11432 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011433 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011435 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011436 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011437}
11438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011442Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
11445static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011446unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 Py_ssize_t i, length;
11449 int kind;
11450 void *data;
11451
11452 if (PyUnicode_READY(self) == -1)
11453 return NULL;
11454 length = PyUnicode_GET_LENGTH(self);
11455 kind = PyUnicode_KIND(self);
11456 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 if (length == 1)
11460 return PyBool_FromLong(
11461 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011463 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 for (i = 0; i < length; i++) {
11468 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011471 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472}
11473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011474PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011477Return True if all characters in S are digits\n\
11478and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
11480static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011481unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 Py_ssize_t i, length;
11484 int kind;
11485 void *data;
11486
11487 if (PyUnicode_READY(self) == -1)
11488 return NULL;
11489 length = PyUnicode_GET_LENGTH(self);
11490 kind = PyUnicode_KIND(self);
11491 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (length == 1) {
11495 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11496 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011499 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 for (i = 0; i < length; i++) {
11504 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011507 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508}
11509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011510PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011513Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011514False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
11516static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011517unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 Py_ssize_t i, length;
11520 int kind;
11521 void *data;
11522
11523 if (PyUnicode_READY(self) == -1)
11524 return NULL;
11525 length = PyUnicode_GET_LENGTH(self);
11526 kind = PyUnicode_KIND(self);
11527 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (length == 1)
11531 return PyBool_FromLong(
11532 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011534 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 for (i = 0; i < length; i++) {
11539 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011542 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543}
11544
Martin v. Löwis47383402007-08-15 07:32:56 +000011545int
11546PyUnicode_IsIdentifier(PyObject *self)
11547{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 int kind;
11549 void *data;
11550 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011551 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (PyUnicode_READY(self) == -1) {
11554 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 }
11557
11558 /* Special case for empty strings */
11559 if (PyUnicode_GET_LENGTH(self) == 0)
11560 return 0;
11561 kind = PyUnicode_KIND(self);
11562 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011563
11564 /* PEP 3131 says that the first character must be in
11565 XID_Start and subsequent characters in XID_Continue,
11566 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011567 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011568 letters, digits, underscore). However, given the current
11569 definition of XID_Start and XID_Continue, it is sufficient
11570 to check just for these, except that _ must be allowed
11571 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011573 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011574 return 0;
11575
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011576 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011579 return 1;
11580}
11581
11582PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011584\n\
11585Return True if S is a valid identifier according\n\
11586to the language definition.");
11587
11588static PyObject*
11589unicode_isidentifier(PyObject *self)
11590{
11591 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11592}
11593
Georg Brandl559e5d72008-06-11 18:37:52 +000011594PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011596\n\
11597Return True if all characters in S are considered\n\
11598printable in repr() or S is empty, False otherwise.");
11599
11600static PyObject*
11601unicode_isprintable(PyObject *self)
11602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 Py_ssize_t i, length;
11604 int kind;
11605 void *data;
11606
11607 if (PyUnicode_READY(self) == -1)
11608 return NULL;
11609 length = PyUnicode_GET_LENGTH(self);
11610 kind = PyUnicode_KIND(self);
11611 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011612
11613 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (length == 1)
11615 return PyBool_FromLong(
11616 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 for (i = 0; i < length; i++) {
11619 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011620 Py_RETURN_FALSE;
11621 }
11622 }
11623 Py_RETURN_TRUE;
11624}
11625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011627 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628\n\
11629Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011630iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011633unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011635 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636}
11637
Martin v. Löwis18e16552006-02-15 17:27:45 +000011638static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011639unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (PyUnicode_READY(self) == -1)
11642 return -1;
11643 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644}
11645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011646PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011649Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011650done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
11652static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011653unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011655 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 Py_UCS4 fillchar = ' ';
11657
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011658 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659 return NULL;
11660
Victor Stinnerc4b49542011-12-11 22:44:26 +010011661 if (PyUnicode_READY(self) < 0)
11662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
Victor Stinnerc4b49542011-12-11 22:44:26 +010011664 if (PyUnicode_GET_LENGTH(self) >= width)
11665 return unicode_result_unchanged(self);
11666
11667 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668}
11669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011673Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
11675static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011676unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 return fixup(self, fixlower);
11679}
11680
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11689
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690/* externally visible for str.strip(unicode) */
11691PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011692_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 void *data;
11695 int kind;
11696 Py_ssize_t i, j, len;
11697 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11700 return NULL;
11701
11702 kind = PyUnicode_KIND(self);
11703 data = PyUnicode_DATA(self);
11704 len = PyUnicode_GET_LENGTH(self);
11705 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11706 PyUnicode_DATA(sepobj),
11707 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 i = 0;
11710 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 while (i < len &&
11712 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 i++;
11714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
Benjamin Peterson14339b62009-01-31 16:36:08 +000011717 j = len;
11718 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 do {
11720 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 } while (j >= i &&
11722 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011725
Victor Stinner7931d9a2011-11-04 00:22:48 +010011726 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727}
11728
11729PyObject*
11730PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11731{
11732 unsigned char *data;
11733 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011734 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735
Victor Stinnerde636f32011-10-01 03:55:54 +020011736 if (PyUnicode_READY(self) == -1)
11737 return NULL;
11738
11739 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11740
Victor Stinner12bab6d2011-10-01 01:53:49 +020011741 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011742 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743
Victor Stinner12bab6d2011-10-01 01:53:49 +020011744 length = end - start;
11745 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011746 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747
Victor Stinnerde636f32011-10-01 03:55:54 +020011748 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011749 PyErr_SetString(PyExc_IndexError, "string index out of range");
11750 return NULL;
11751 }
11752
Victor Stinnerb9275c12011-10-05 14:01:42 +020011753 if (PyUnicode_IS_ASCII(self)) {
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_1BYTE_DATA(self);
11756 return unicode_fromascii(data + start, length);
11757 }
11758 else {
11759 kind = PyUnicode_KIND(self);
11760 data = PyUnicode_1BYTE_DATA(self);
11761 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011762 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011763 length);
11764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 int kind;
11771 void *data;
11772 Py_ssize_t len, i, j;
11773
11774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776
11777 kind = PyUnicode_KIND(self);
11778 data = PyUnicode_DATA(self);
11779 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781 i = 0;
11782 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011784 i++;
11785 }
11786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 j = len;
11789 if (striptype != LEFTSTRIP) {
11790 do {
11791 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011793 j++;
11794 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795
Victor Stinner7931d9a2011-11-04 00:22:48 +010011796 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797}
11798
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799
11800static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011801do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804
Benjamin Peterson14339b62009-01-31 16:36:08 +000011805 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11806 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 if (sep != NULL && sep != Py_None) {
11809 if (PyUnicode_Check(sep))
11810 return _PyUnicode_XStrip(self, striptype, sep);
11811 else {
11812 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 "%s arg must be None or str",
11814 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 return NULL;
11816 }
11817 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
Benjamin Peterson14339b62009-01-31 16:36:08 +000011819 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820}
11821
11822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011823PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825\n\
11826Return a copy of the string S with leading and trailing\n\
11827whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011828If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829
11830static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011831unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 if (PyTuple_GET_SIZE(args) == 0)
11834 return do_strip(self, BOTHSTRIP); /* Common case */
11835 else
11836 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837}
11838
11839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011840PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842\n\
11843Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011844If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845
11846static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011847unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011848{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011849 if (PyTuple_GET_SIZE(args) == 0)
11850 return do_strip(self, LEFTSTRIP); /* Common case */
11851 else
11852 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853}
11854
11855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011856PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011858\n\
11859Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011860If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011861
11862static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011863unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011864{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011865 if (PyTuple_GET_SIZE(args) == 0)
11866 return do_strip(self, RIGHTSTRIP); /* Common case */
11867 else
11868 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869}
11870
11871
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011873unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011875 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
Georg Brandl222de0f2009-04-12 12:01:50 +000011878 if (len < 1) {
11879 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011880 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882
Victor Stinnerc4b49542011-12-11 22:44:26 +010011883 /* no repeat, return original string */
11884 if (len == 1)
11885 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011886
Victor Stinnerc4b49542011-12-11 22:44:26 +010011887 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 return NULL;
11889
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011890 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011891 PyErr_SetString(PyExc_OverflowError,
11892 "repeated string is too long");
11893 return NULL;
11894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011896
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011897 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 if (!u)
11899 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011900 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (PyUnicode_GET_LENGTH(str) == 1) {
11903 const int kind = PyUnicode_KIND(str);
11904 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11905 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011906 if (kind == PyUnicode_1BYTE_KIND)
11907 memset(to, (unsigned char)fill_char, len);
11908 else {
11909 for (n = 0; n < len; ++n)
11910 PyUnicode_WRITE(kind, to, n, fill_char);
11911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 }
11913 else {
11914 /* number of characters copied this far */
11915 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011916 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 char *to = (char *) PyUnicode_DATA(u);
11918 Py_MEMCPY(to, PyUnicode_DATA(str),
11919 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 n = (done <= nchars-done) ? done : nchars-done;
11922 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011923 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 }
11926
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011927 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011928 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929}
11930
Alexander Belopolsky40018472011-02-26 01:02:56 +000011931PyObject *
11932PyUnicode_Replace(PyObject *obj,
11933 PyObject *subobj,
11934 PyObject *replobj,
11935 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936{
11937 PyObject *self;
11938 PyObject *str1;
11939 PyObject *str2;
11940 PyObject *result;
11941
11942 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011943 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011946 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 Py_DECREF(self);
11948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
11950 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011951 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 Py_DECREF(self);
11953 Py_DECREF(str1);
11954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 Py_DECREF(self);
11958 Py_DECREF(str1);
11959 Py_DECREF(str2);
11960 return result;
11961}
11962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011963PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011964 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965\n\
11966Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011967old replaced by new. If the optional argument count is\n\
11968given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
11970static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 PyObject *str1;
11974 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011975 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 PyObject *result;
11977
Martin v. Löwis18e16552006-02-15 17:27:45 +000011978 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 str1 = PyUnicode_FromObject(str1);
11983 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11984 return NULL;
11985 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011986 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011987 Py_DECREF(str1);
11988 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990
11991 result = replace(self, str1, str2, maxcount);
11992
11993 Py_DECREF(str1);
11994 Py_DECREF(str2);
11995 return result;
11996}
11997
Alexander Belopolsky40018472011-02-26 01:02:56 +000011998static PyObject *
11999unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012001 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 Py_ssize_t isize;
12003 Py_ssize_t osize, squote, dquote, i, o;
12004 Py_UCS4 max, quote;
12005 int ikind, okind;
12006 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012009 return NULL;
12010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 isize = PyUnicode_GET_LENGTH(unicode);
12012 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 /* Compute length of output, quote characters, and
12015 maximum character */
12016 osize = 2; /* quotes */
12017 max = 127;
12018 squote = dquote = 0;
12019 ikind = PyUnicode_KIND(unicode);
12020 for (i = 0; i < isize; i++) {
12021 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12022 switch (ch) {
12023 case '\'': squote++; osize++; break;
12024 case '"': dquote++; osize++; break;
12025 case '\\': case '\t': case '\r': case '\n':
12026 osize += 2; break;
12027 default:
12028 /* Fast-path ASCII */
12029 if (ch < ' ' || ch == 0x7f)
12030 osize += 4; /* \xHH */
12031 else if (ch < 0x7f)
12032 osize++;
12033 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12034 osize++;
12035 max = ch > max ? ch : max;
12036 }
12037 else if (ch < 0x100)
12038 osize += 4; /* \xHH */
12039 else if (ch < 0x10000)
12040 osize += 6; /* \uHHHH */
12041 else
12042 osize += 10; /* \uHHHHHHHH */
12043 }
12044 }
12045
12046 quote = '\'';
12047 if (squote) {
12048 if (dquote)
12049 /* Both squote and dquote present. Use squote,
12050 and escape them */
12051 osize += squote;
12052 else
12053 quote = '"';
12054 }
12055
12056 repr = PyUnicode_New(osize, max);
12057 if (repr == NULL)
12058 return NULL;
12059 okind = PyUnicode_KIND(repr);
12060 odata = PyUnicode_DATA(repr);
12061
12062 PyUnicode_WRITE(okind, odata, 0, quote);
12063 PyUnicode_WRITE(okind, odata, osize-1, quote);
12064
12065 for (i = 0, o = 1; i < isize; i++) {
12066 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012067
12068 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if ((ch == quote) || (ch == '\\')) {
12070 PyUnicode_WRITE(okind, odata, o++, '\\');
12071 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012072 continue;
12073 }
12074
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012076 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 PyUnicode_WRITE(okind, odata, o++, '\\');
12078 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012079 }
12080 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 PyUnicode_WRITE(okind, odata, o++, '\\');
12082 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012083 }
12084 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 PyUnicode_WRITE(okind, odata, o++, '\\');
12086 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012087 }
12088
12089 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012090 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 PyUnicode_WRITE(okind, odata, o++, '\\');
12092 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012095 }
12096
Georg Brandl559e5d72008-06-11 18:37:52 +000012097 /* Copy ASCII characters as-is */
12098 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012100 }
12101
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 (categories Z* and C* except ASCII space)
12106 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012108 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (ch <= 0xff) {
12110 PyUnicode_WRITE(okind, odata, o++, '\\');
12111 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012114 }
12115 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 else if (ch >= 0x10000) {
12117 PyUnicode_WRITE(okind, odata, o++, '\\');
12118 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012127 }
12128 /* Map 16-bit characters to '\uxxxx' */
12129 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 PyUnicode_WRITE(okind, odata, o++, '\\');
12131 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012136 }
12137 }
12138 /* Copy characters as-is */
12139 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012141 }
12142 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012145 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012146 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147}
12148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012149PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151\n\
12152Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012153such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154arguments start and end are interpreted as in slice notation.\n\
12155\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012156Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
12158static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012161 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012162 Py_ssize_t start;
12163 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012164 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
Jesus Ceaac451502011-04-20 17:09:23 +020012166 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12167 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (PyUnicode_READY(self) == -1)
12171 return NULL;
12172 if (PyUnicode_READY(substring) == -1)
12173 return NULL;
12174
Victor Stinner7931d9a2011-11-04 00:22:48 +010012175 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
12177 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (result == -2)
12180 return NULL;
12181
Christian Heimes217cfd12007-12-02 14:31:20 +000012182 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183}
12184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012185PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012188Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
12190static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012193 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012194 Py_ssize_t start;
12195 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012196 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Jesus Ceaac451502011-04-20 17:09:23 +020012198 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12199 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (PyUnicode_READY(self) == -1)
12203 return NULL;
12204 if (PyUnicode_READY(substring) == -1)
12205 return NULL;
12206
Victor Stinner7931d9a2011-11-04 00:22:48 +010012207 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
12209 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 if (result == -2)
12212 return NULL;
12213
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214 if (result < 0) {
12215 PyErr_SetString(PyExc_ValueError, "substring not found");
12216 return NULL;
12217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218
Christian Heimes217cfd12007-12-02 14:31:20 +000012219 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220}
12221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012222PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012225Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012226done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
12228static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012229unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012231 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 Py_UCS4 fillchar = ' ';
12233
Victor Stinnere9a29352011-10-01 02:14:59 +020012234 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012236
Victor Stinnerc4b49542011-12-11 22:44:26 +010012237 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 return NULL;
12239
Victor Stinnerc4b49542011-12-11 22:44:26 +010012240 if (PyUnicode_GET_LENGTH(self) >= width)
12241 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
Victor Stinnerc4b49542011-12-11 22:44:26 +010012243 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244}
12245
Alexander Belopolsky40018472011-02-26 01:02:56 +000012246PyObject *
12247PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248{
12249 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012250
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 s = PyUnicode_FromObject(s);
12252 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012253 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 if (sep != NULL) {
12255 sep = PyUnicode_FromObject(sep);
12256 if (sep == NULL) {
12257 Py_DECREF(s);
12258 return NULL;
12259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260 }
12261
Victor Stinner9310abb2011-10-05 00:59:23 +020012262 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
12264 Py_DECREF(s);
12265 Py_XDECREF(sep);
12266 return result;
12267}
12268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012269PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271\n\
12272Return a list of the words in S, using sep as the\n\
12273delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012274splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012275whitespace string is a separator and empty strings are\n\
12276removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
12278static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012279unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280{
12281 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012282 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
Martin v. Löwis18e16552006-02-15 17:27:45 +000012284 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 return NULL;
12286
12287 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012290 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012292 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293}
12294
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295PyObject *
12296PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12297{
12298 PyObject* str_obj;
12299 PyObject* sep_obj;
12300 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 int kind1, kind2, kind;
12302 void *buf1 = NULL, *buf2 = NULL;
12303 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304
12305 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012306 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310 Py_DECREF(str_obj);
12311 return NULL;
12312 }
12313
Victor Stinner14f8f022011-10-05 20:58:25 +020012314 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012316 kind = Py_MAX(kind1, kind2);
12317 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012319 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 if (!buf1)
12321 goto onError;
12322 buf2 = PyUnicode_DATA(sep_obj);
12323 if (kind2 != kind)
12324 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12325 if (!buf2)
12326 goto onError;
12327 len1 = PyUnicode_GET_LENGTH(str_obj);
12328 len2 = PyUnicode_GET_LENGTH(sep_obj);
12329
Victor Stinner14f8f022011-10-05 20:58:25 +020012330 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012332 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12333 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12334 else
12335 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 break;
12337 case PyUnicode_2BYTE_KIND:
12338 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12339 break;
12340 case PyUnicode_4BYTE_KIND:
12341 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12342 break;
12343 default:
12344 assert(0);
12345 out = 0;
12346 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012347
12348 Py_DECREF(sep_obj);
12349 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (kind1 != kind)
12351 PyMem_Free(buf1);
12352 if (kind2 != kind)
12353 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012354
12355 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 onError:
12357 Py_DECREF(sep_obj);
12358 Py_DECREF(str_obj);
12359 if (kind1 != kind && buf1)
12360 PyMem_Free(buf1);
12361 if (kind2 != kind && buf2)
12362 PyMem_Free(buf2);
12363 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012364}
12365
12366
12367PyObject *
12368PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12369{
12370 PyObject* str_obj;
12371 PyObject* sep_obj;
12372 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 int kind1, kind2, kind;
12374 void *buf1 = NULL, *buf2 = NULL;
12375 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012376
12377 str_obj = PyUnicode_FromObject(str_in);
12378 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380 sep_obj = PyUnicode_FromObject(sep_in);
12381 if (!sep_obj) {
12382 Py_DECREF(str_obj);
12383 return NULL;
12384 }
12385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 kind1 = PyUnicode_KIND(str_in);
12387 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012388 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 buf1 = PyUnicode_DATA(str_in);
12390 if (kind1 != kind)
12391 buf1 = _PyUnicode_AsKind(str_in, kind);
12392 if (!buf1)
12393 goto onError;
12394 buf2 = PyUnicode_DATA(sep_obj);
12395 if (kind2 != kind)
12396 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12397 if (!buf2)
12398 goto onError;
12399 len1 = PyUnicode_GET_LENGTH(str_obj);
12400 len2 = PyUnicode_GET_LENGTH(sep_obj);
12401
12402 switch(PyUnicode_KIND(str_in)) {
12403 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012404 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12405 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12406 else
12407 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 break;
12409 case PyUnicode_2BYTE_KIND:
12410 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12411 break;
12412 case PyUnicode_4BYTE_KIND:
12413 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12414 break;
12415 default:
12416 assert(0);
12417 out = 0;
12418 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012419
12420 Py_DECREF(sep_obj);
12421 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 if (kind1 != kind)
12423 PyMem_Free(buf1);
12424 if (kind2 != kind)
12425 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012426
12427 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 onError:
12429 Py_DECREF(sep_obj);
12430 Py_DECREF(str_obj);
12431 if (kind1 != kind && buf1)
12432 PyMem_Free(buf1);
12433 if (kind2 != kind && buf2)
12434 PyMem_Free(buf2);
12435 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436}
12437
12438PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012441Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012442the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012443found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012444
12445static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012446unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012447{
Victor Stinner9310abb2011-10-05 00:59:23 +020012448 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449}
12450
12451PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012452 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012454Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012455the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012456separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457
12458static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012459unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460{
Victor Stinner9310abb2011-10-05 00:59:23 +020012461 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012462}
12463
Alexander Belopolsky40018472011-02-26 01:02:56 +000012464PyObject *
12465PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012466{
12467 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012468
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469 s = PyUnicode_FromObject(s);
12470 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 if (sep != NULL) {
12473 sep = PyUnicode_FromObject(sep);
12474 if (sep == NULL) {
12475 Py_DECREF(s);
12476 return NULL;
12477 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012478 }
12479
Victor Stinner9310abb2011-10-05 00:59:23 +020012480 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012481
12482 Py_DECREF(s);
12483 Py_XDECREF(sep);
12484 return result;
12485}
12486
12487PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012489\n\
12490Return a list of the words in S, using sep as the\n\
12491delimiter string, starting at the end of the string and\n\
12492working to the front. If maxsplit is given, at most maxsplit\n\
12493splits are done. If sep is not specified, any whitespace string\n\
12494is a separator.");
12495
12496static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012497unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012498{
12499 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012500 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012501
Martin v. Löwis18e16552006-02-15 17:27:45 +000012502 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 return NULL;
12504
12505 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012508 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012509 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012510 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012511}
12512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012513PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515\n\
12516Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012517Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012518is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012521unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012523 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012524 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012526 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12527 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528 return NULL;
12529
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012530 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
12533static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012534PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537}
12538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012539PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541\n\
12542Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012543and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
12545static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012546unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548 return fixup(self, fixswapcase);
12549}
12550
Georg Brandlceee0772007-11-27 23:48:05 +000012551PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012552 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012553\n\
12554Return a translation table usable for str.translate().\n\
12555If there is only one argument, it must be a dictionary mapping Unicode\n\
12556ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012557Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012558If there are two arguments, they must be strings of equal length, and\n\
12559in the resulting dictionary, each character in x will be mapped to the\n\
12560character at the same position in y. If there is a third argument, it\n\
12561must be a string, whose characters will be mapped to None in the result.");
12562
12563static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012564unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012565{
12566 PyObject *x, *y = NULL, *z = NULL;
12567 PyObject *new = NULL, *key, *value;
12568 Py_ssize_t i = 0;
12569 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012570
Georg Brandlceee0772007-11-27 23:48:05 +000012571 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12572 return NULL;
12573 new = PyDict_New();
12574 if (!new)
12575 return NULL;
12576 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 int x_kind, y_kind, z_kind;
12578 void *x_data, *y_data, *z_data;
12579
Georg Brandlceee0772007-11-27 23:48:05 +000012580 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012581 if (!PyUnicode_Check(x)) {
12582 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12583 "be a string if there is a second argument");
12584 goto err;
12585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012587 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12588 "arguments must have equal length");
12589 goto err;
12590 }
12591 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 x_kind = PyUnicode_KIND(x);
12593 y_kind = PyUnicode_KIND(y);
12594 x_data = PyUnicode_DATA(x);
12595 y_data = PyUnicode_DATA(y);
12596 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12597 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12598 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012599 if (!key || !value)
12600 goto err;
12601 res = PyDict_SetItem(new, key, value);
12602 Py_DECREF(key);
12603 Py_DECREF(value);
12604 if (res < 0)
12605 goto err;
12606 }
12607 /* create entries for deleting chars in z */
12608 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 z_kind = PyUnicode_KIND(z);
12610 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012611 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012613 if (!key)
12614 goto err;
12615 res = PyDict_SetItem(new, key, Py_None);
12616 Py_DECREF(key);
12617 if (res < 0)
12618 goto err;
12619 }
12620 }
12621 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 int kind;
12623 void *data;
12624
Georg Brandlceee0772007-11-27 23:48:05 +000012625 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012626 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012627 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12628 "to maketrans it must be a dict");
12629 goto err;
12630 }
12631 /* copy entries into the new dict, converting string keys to int keys */
12632 while (PyDict_Next(x, &i, &key, &value)) {
12633 if (PyUnicode_Check(key)) {
12634 /* convert string keys to integer keys */
12635 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012636 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012637 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12638 "table must be of length 1");
12639 goto err;
12640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 kind = PyUnicode_KIND(key);
12642 data = PyUnicode_DATA(key);
12643 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012644 if (!newkey)
12645 goto err;
12646 res = PyDict_SetItem(new, newkey, value);
12647 Py_DECREF(newkey);
12648 if (res < 0)
12649 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012650 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012651 /* just keep integer keys */
12652 if (PyDict_SetItem(new, key, value) < 0)
12653 goto err;
12654 } else {
12655 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12656 "be strings or integers");
12657 goto err;
12658 }
12659 }
12660 }
12661 return new;
12662 err:
12663 Py_DECREF(new);
12664 return NULL;
12665}
12666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012667PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669\n\
12670Return a copy of the string S, where all characters have been mapped\n\
12671through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012672Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012673Unmapped characters are left untouched. Characters mapped to None\n\
12674are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
12676static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680}
12681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012682PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012685Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686
12687static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012688unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 return fixup(self, fixupper);
12691}
12692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012693PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012696Pad a numeric string S with zeros on the left, to fill a field\n\
12697of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698
12699static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012700unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012702 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012703 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012704 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 int kind;
12706 void *data;
12707 Py_UCS4 chr;
12708
Martin v. Löwis18e16552006-02-15 17:27:45 +000012709 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710 return NULL;
12711
Victor Stinnerc4b49542011-12-11 22:44:26 +010012712 if (PyUnicode_READY(self) < 0)
12713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
Victor Stinnerc4b49542011-12-11 22:44:26 +010012715 if (PyUnicode_GET_LENGTH(self) >= width)
12716 return unicode_result_unchanged(self);
12717
12718 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719
12720 u = pad(self, fill, 0, '0');
12721
Walter Dörwald068325e2002-04-15 13:36:47 +000012722 if (u == NULL)
12723 return NULL;
12724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 kind = PyUnicode_KIND(u);
12726 data = PyUnicode_DATA(u);
12727 chr = PyUnicode_READ(kind, data, fill);
12728
12729 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 PyUnicode_WRITE(kind, data, 0, chr);
12732 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733 }
12734
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012735 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012736 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012747PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012750Return True if S starts with the specified prefix, False otherwise.\n\
12751With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012752With optional end, stop comparing S at that position.\n\
12753prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012754
12755static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012756unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012760 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012761 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012762 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012763 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
Jesus Ceaac451502011-04-20 17:09:23 +020012765 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 if (PyTuple_Check(subobj)) {
12768 Py_ssize_t i;
12769 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012771 if (substring == NULL)
12772 return NULL;
12773 result = tailmatch(self, substring, start, end, -1);
12774 Py_DECREF(substring);
12775 if (result) {
12776 Py_RETURN_TRUE;
12777 }
12778 }
12779 /* nothing matched */
12780 Py_RETURN_FALSE;
12781 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012782 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012783 if (substring == NULL) {
12784 if (PyErr_ExceptionMatches(PyExc_TypeError))
12785 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12786 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012788 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012789 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012791 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792}
12793
12794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012795PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012798Return True if S ends with the specified suffix, False otherwise.\n\
12799With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012800With optional end, stop comparing S at that position.\n\
12801suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
12803static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012804unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012808 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012809 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012810 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012811 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
Jesus Ceaac451502011-04-20 17:09:23 +020012813 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815 if (PyTuple_Check(subobj)) {
12816 Py_ssize_t i;
12817 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012818 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012820 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 result = tailmatch(self, substring, start, end, +1);
12823 Py_DECREF(substring);
12824 if (result) {
12825 Py_RETURN_TRUE;
12826 }
12827 }
12828 Py_RETURN_FALSE;
12829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012830 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012831 if (substring == NULL) {
12832 if (PyErr_ExceptionMatches(PyExc_TypeError))
12833 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12834 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012836 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012837 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012839 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840}
12841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012843
12844PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012846\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012847Return a formatted version of S, using substitutions from args and kwargs.\n\
12848The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012849
Eric Smith27bbca62010-11-04 17:06:58 +000012850PyDoc_STRVAR(format_map__doc__,
12851 "S.format_map(mapping) -> str\n\
12852\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012853Return a formatted version of S, using substitutions from mapping.\n\
12854The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012855
Eric Smith4a7d76d2008-05-30 18:10:19 +000012856static PyObject *
12857unicode__format__(PyObject* self, PyObject* args)
12858{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012859 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012860
12861 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12862 return NULL;
12863
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012864 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012866 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012867}
12868
Eric Smith8c663262007-08-25 02:26:07 +000012869PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012871\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012872Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012873
12874static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012875unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 Py_ssize_t size;
12878
12879 /* If it's a compact object, account for base structure +
12880 character data. */
12881 if (PyUnicode_IS_COMPACT_ASCII(v))
12882 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12883 else if (PyUnicode_IS_COMPACT(v))
12884 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012885 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 else {
12887 /* If it is a two-block object, account for base object, and
12888 for character block if present. */
12889 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012890 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012892 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 }
12894 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012895 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012896 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012898 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012899 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900
12901 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012902}
12903
12904PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012906
12907static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012908unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012909{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012910 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 if (!copy)
12912 return NULL;
12913 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012914}
12915
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916static PyMethodDef unicode_methods[] = {
12917
12918 /* Order is according to common usage: often used methods should
12919 appear first, since lookup is done sequentially. */
12920
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012921 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012922 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12923 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012924 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012925 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12926 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12927 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12928 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12929 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12930 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12931 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012932 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012933 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12934 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12935 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012936 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012937 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12938 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12939 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012940 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012941 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012942 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012943 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012944 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12945 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12946 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12947 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12948 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12949 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12950 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12951 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12952 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12953 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12954 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12955 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12956 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12957 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012958 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012959 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012960 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012961 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012962 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012963 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012964 {"maketrans", (PyCFunction) unicode_maketrans,
12965 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012966 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012967#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012968 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969#endif
12970
12971#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012972 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012973 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974#endif
12975
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977 {NULL, NULL}
12978};
12979
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012980static PyObject *
12981unicode_mod(PyObject *v, PyObject *w)
12982{
Brian Curtindfc80e32011-08-10 20:28:54 -050012983 if (!PyUnicode_Check(v))
12984 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012986}
12987
12988static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012989 0, /*nb_add*/
12990 0, /*nb_subtract*/
12991 0, /*nb_multiply*/
12992 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012993};
12994
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012996 (lenfunc) unicode_length, /* sq_length */
12997 PyUnicode_Concat, /* sq_concat */
12998 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12999 (ssizeargfunc) unicode_getitem, /* sq_item */
13000 0, /* sq_slice */
13001 0, /* sq_ass_item */
13002 0, /* sq_ass_slice */
13003 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004};
13005
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013006static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013007unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013008{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 if (PyUnicode_READY(self) == -1)
13010 return NULL;
13011
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013012 if (PyIndex_Check(item)) {
13013 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013014 if (i == -1 && PyErr_Occurred())
13015 return NULL;
13016 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013018 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013019 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013020 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013021 PyObject *result;
13022 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013023 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013024 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013028 return NULL;
13029 }
13030
13031 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013032 Py_INCREF(unicode_empty);
13033 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013035 slicelength == PyUnicode_GET_LENGTH(self)) {
13036 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013037 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013038 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013039 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013040 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013041 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013042 src_kind = PyUnicode_KIND(self);
13043 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013044 if (!PyUnicode_IS_ASCII(self)) {
13045 kind_limit = kind_maxchar_limit(src_kind);
13046 max_char = 0;
13047 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13048 ch = PyUnicode_READ(src_kind, src_data, cur);
13049 if (ch > max_char) {
13050 max_char = ch;
13051 if (max_char >= kind_limit)
13052 break;
13053 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013054 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013055 }
Victor Stinner55c99112011-10-13 01:17:06 +020013056 else
13057 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013059 if (result == NULL)
13060 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013061 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013062 dest_data = PyUnicode_DATA(result);
13063
13064 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013065 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13066 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013067 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013068 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013069 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013070 } else {
13071 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13072 return NULL;
13073 }
13074}
13075
13076static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013077 (lenfunc)unicode_length, /* mp_length */
13078 (binaryfunc)unicode_subscript, /* mp_subscript */
13079 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013080};
13081
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083/* Helpers for PyUnicode_Format() */
13084
13085static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013086getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013088 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013090 (*p_argidx)++;
13091 if (arglen < 0)
13092 return args;
13093 else
13094 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095 }
13096 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013097 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 return NULL;
13099}
13100
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013101/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013103static PyObject *
13104formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013106 char *p;
13107 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 x = PyFloat_AsDouble(v);
13111 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013112 return NULL;
13113
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013116
Eric Smith0923d1d2009-04-16 20:16:10 +000013117 p = PyOS_double_to_string(x, type, prec,
13118 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013119 if (p == NULL)
13120 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013122 PyMem_Free(p);
13123 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124}
13125
Tim Peters38fd5b62000-09-21 05:43:11 +000013126static PyObject*
13127formatlong(PyObject *val, int flags, int prec, int type)
13128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013129 char *buf;
13130 int len;
13131 PyObject *str; /* temporary string object. */
13132 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013133
Benjamin Peterson14339b62009-01-31 16:36:08 +000013134 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13135 if (!str)
13136 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 Py_DECREF(str);
13139 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013140}
13141
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013142static Py_UCS4
13143formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013145 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013146 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013148 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 goto onError;
13151 }
13152 else {
13153 /* Integer input truncated to a character */
13154 long x;
13155 x = PyLong_AsLong(v);
13156 if (x == -1 && PyErr_Occurred())
13157 goto onError;
13158
Victor Stinner8faf8212011-12-08 22:14:11 +010013159 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 PyErr_SetString(PyExc_OverflowError,
13161 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013162 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 }
13164
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013165 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013167
Benjamin Peterson29060642009-01-31 22:14:21 +000013168 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013169 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013171 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172}
13173
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013174static int
13175repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13176{
13177 int r;
13178 assert(count > 0);
13179 assert(PyUnicode_Check(obj));
13180 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013182 if (repeated == NULL)
13183 return -1;
13184 r = _PyAccu_Accumulate(acc, repeated);
13185 Py_DECREF(repeated);
13186 return r;
13187 }
13188 else {
13189 do {
13190 if (_PyAccu_Accumulate(acc, obj))
13191 return -1;
13192 } while (--count);
13193 return 0;
13194 }
13195}
13196
Alexander Belopolsky40018472011-02-26 01:02:56 +000013197PyObject *
13198PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 void *fmt;
13201 int fmtkind;
13202 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013204 int r;
13205 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013208 PyObject *temp = NULL;
13209 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013210 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013211 _PyAccu acc;
13212 static PyObject *plus, *minus, *blank, *zero, *percent;
13213
13214 if (!plus && !(plus = get_latin1_char('+')))
13215 return NULL;
13216 if (!minus && !(minus = get_latin1_char('-')))
13217 return NULL;
13218 if (!blank && !(blank = get_latin1_char(' ')))
13219 return NULL;
13220 if (!zero && !(zero = get_latin1_char('0')))
13221 return NULL;
13222 if (!percent && !(percent = get_latin1_char('%')))
13223 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013224
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013226 PyErr_BadInternalCall();
13227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013229 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013232 if (_PyAccu_Init(&acc))
13233 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 fmt = PyUnicode_DATA(uformat);
13235 fmtkind = PyUnicode_KIND(uformat);
13236 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13237 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 arglen = PyTuple_Size(args);
13241 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242 }
13243 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013244 arglen = -1;
13245 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013247 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013248 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250
13251 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013253 PyObject *nonfmt;
13254 Py_ssize_t nonfmtpos;
13255 nonfmtpos = fmtpos++;
13256 while (fmtcnt >= 0 &&
13257 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13258 fmtpos++;
13259 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013260 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013261 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013262 if (nonfmt == NULL)
13263 goto onError;
13264 r = _PyAccu_Accumulate(&acc, nonfmt);
13265 Py_DECREF(nonfmt);
13266 if (r)
13267 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 }
13269 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013270 /* Got a format specifier */
13271 int flags = 0;
13272 Py_ssize_t width = -1;
13273 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013275 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 int isnumok;
13277 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013278 void *pbuf = NULL;
13279 Py_ssize_t pindex, len;
13280 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 fmtpos++;
13283 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13284 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 Py_ssize_t keylen;
13286 PyObject *key;
13287 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013288
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 if (dict == NULL) {
13290 PyErr_SetString(PyExc_TypeError,
13291 "format requires a mapping");
13292 goto onError;
13293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 /* Skip over balanced parentheses */
13298 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013303 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 if (fmtcnt < 0 || pcount > 0) {
13307 PyErr_SetString(PyExc_ValueError,
13308 "incomplete format key");
13309 goto onError;
13310 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013311 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013312 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 if (key == NULL)
13314 goto onError;
13315 if (args_owned) {
13316 Py_DECREF(args);
13317 args_owned = 0;
13318 }
13319 args = PyObject_GetItem(dict, key);
13320 Py_DECREF(key);
13321 if (args == NULL) {
13322 goto onError;
13323 }
13324 args_owned = 1;
13325 arglen = -1;
13326 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013327 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 case '-': flags |= F_LJUST; continue;
13331 case '+': flags |= F_SIGN; continue;
13332 case ' ': flags |= F_BLANK; continue;
13333 case '#': flags |= F_ALT; continue;
13334 case '0': flags |= F_ZERO; continue;
13335 }
13336 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013337 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 if (c == '*') {
13339 v = getnextarg(args, arglen, &argidx);
13340 if (v == NULL)
13341 goto onError;
13342 if (!PyLong_Check(v)) {
13343 PyErr_SetString(PyExc_TypeError,
13344 "* wants int");
13345 goto onError;
13346 }
13347 width = PyLong_AsLong(v);
13348 if (width == -1 && PyErr_Occurred())
13349 goto onError;
13350 if (width < 0) {
13351 flags |= F_LJUST;
13352 width = -width;
13353 }
13354 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 }
13357 else if (c >= '0' && c <= '9') {
13358 width = c - '0';
13359 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 if (c < '0' || c > '9')
13362 break;
13363 if ((width*10) / 10 != width) {
13364 PyErr_SetString(PyExc_ValueError,
13365 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013366 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 }
13368 width = width*10 + (c - '0');
13369 }
13370 }
13371 if (c == '.') {
13372 prec = 0;
13373 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 if (c == '*') {
13376 v = getnextarg(args, arglen, &argidx);
13377 if (v == NULL)
13378 goto onError;
13379 if (!PyLong_Check(v)) {
13380 PyErr_SetString(PyExc_TypeError,
13381 "* wants int");
13382 goto onError;
13383 }
13384 prec = PyLong_AsLong(v);
13385 if (prec == -1 && PyErr_Occurred())
13386 goto onError;
13387 if (prec < 0)
13388 prec = 0;
13389 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 }
13392 else if (c >= '0' && c <= '9') {
13393 prec = c - '0';
13394 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 if (c < '0' || c > '9')
13397 break;
13398 if ((prec*10) / 10 != prec) {
13399 PyErr_SetString(PyExc_ValueError,
13400 "prec too big");
13401 goto onError;
13402 }
13403 prec = prec*10 + (c - '0');
13404 }
13405 }
13406 } /* prec */
13407 if (fmtcnt >= 0) {
13408 if (c == 'h' || c == 'l' || c == 'L') {
13409 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 }
13412 }
13413 if (fmtcnt < 0) {
13414 PyErr_SetString(PyExc_ValueError,
13415 "incomplete format");
13416 goto onError;
13417 }
13418 if (c != '%') {
13419 v = getnextarg(args, arglen, &argidx);
13420 if (v == NULL)
13421 goto onError;
13422 }
13423 sign = 0;
13424 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013425 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 switch (c) {
13427
13428 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013429 _PyAccu_Accumulate(&acc, percent);
13430 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013431
13432 case 's':
13433 case 'r':
13434 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013435 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 temp = v;
13437 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 }
13439 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 if (c == 's')
13441 temp = PyObject_Str(v);
13442 else if (c == 'r')
13443 temp = PyObject_Repr(v);
13444 else
13445 temp = PyObject_ASCII(v);
13446 if (temp == NULL)
13447 goto onError;
13448 if (PyUnicode_Check(temp))
13449 /* nothing to do */;
13450 else {
13451 Py_DECREF(temp);
13452 PyErr_SetString(PyExc_TypeError,
13453 "%s argument has non-string str()");
13454 goto onError;
13455 }
13456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013457 if (PyUnicode_READY(temp) == -1) {
13458 Py_CLEAR(temp);
13459 goto onError;
13460 }
13461 pbuf = PyUnicode_DATA(temp);
13462 kind = PyUnicode_KIND(temp);
13463 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 if (prec >= 0 && len > prec)
13465 len = prec;
13466 break;
13467
13468 case 'i':
13469 case 'd':
13470 case 'u':
13471 case 'o':
13472 case 'x':
13473 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013474 isnumok = 0;
13475 if (PyNumber_Check(v)) {
13476 PyObject *iobj=NULL;
13477
13478 if (PyLong_Check(v)) {
13479 iobj = v;
13480 Py_INCREF(iobj);
13481 }
13482 else {
13483 iobj = PyNumber_Long(v);
13484 }
13485 if (iobj!=NULL) {
13486 if (PyLong_Check(iobj)) {
13487 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013488 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 Py_DECREF(iobj);
13490 if (!temp)
13491 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 if (PyUnicode_READY(temp) == -1) {
13493 Py_CLEAR(temp);
13494 goto onError;
13495 }
13496 pbuf = PyUnicode_DATA(temp);
13497 kind = PyUnicode_KIND(temp);
13498 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 sign = 1;
13500 }
13501 else {
13502 Py_DECREF(iobj);
13503 }
13504 }
13505 }
13506 if (!isnumok) {
13507 PyErr_Format(PyExc_TypeError,
13508 "%%%c format: a number is required, "
13509 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13510 goto onError;
13511 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013512 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013514 fillobj = zero;
13515 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 break;
13517
13518 case 'e':
13519 case 'E':
13520 case 'f':
13521 case 'F':
13522 case 'g':
13523 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013524 temp = formatfloat(v, flags, prec, c);
13525 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013527 if (PyUnicode_READY(temp) == -1) {
13528 Py_CLEAR(temp);
13529 goto onError;
13530 }
13531 pbuf = PyUnicode_DATA(temp);
13532 kind = PyUnicode_KIND(temp);
13533 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013534 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013535 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013537 fillobj = zero;
13538 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 break;
13540
13541 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013542 {
13543 Py_UCS4 ch = formatchar(v);
13544 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 temp = _PyUnicode_FromUCS4(&ch, 1);
13547 if (temp == NULL)
13548 goto onError;
13549 pbuf = PyUnicode_DATA(temp);
13550 kind = PyUnicode_KIND(temp);
13551 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013553 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013554
13555 default:
13556 PyErr_Format(PyExc_ValueError,
13557 "unsupported format character '%c' (0x%x) "
13558 "at index %zd",
13559 (31<=c && c<=126) ? (char)c : '?',
13560 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013561 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 goto onError;
13563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013564 /* pbuf is initialized here. */
13565 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013567 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13568 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 pindex++;
13571 }
13572 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13573 signobj = plus;
13574 len--;
13575 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013576 }
13577 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 else
13582 sign = 0;
13583 }
13584 if (width < len)
13585 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013587 if (fill != ' ') {
13588 assert(signobj != NULL);
13589 if (_PyAccu_Accumulate(&acc, signobj))
13590 goto onError;
13591 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 if (width > len)
13593 width--;
13594 }
13595 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013597 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013599 second = get_latin1_char(
13600 PyUnicode_READ(kind, pbuf, pindex + 1));
13601 pindex += 2;
13602 if (second == NULL ||
13603 _PyAccu_Accumulate(&acc, zero) ||
13604 _PyAccu_Accumulate(&acc, second))
13605 goto onError;
13606 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 width -= 2;
13609 if (width < 0)
13610 width = 0;
13611 len -= 2;
13612 }
13613 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013614 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013615 if (repeat_accumulate(&acc, fillobj, width - len))
13616 goto onError;
13617 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 }
13619 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013620 if (sign) {
13621 assert(signobj != NULL);
13622 if (_PyAccu_Accumulate(&acc, signobj))
13623 goto onError;
13624 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013626 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13627 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013628 second = get_latin1_char(
13629 PyUnicode_READ(kind, pbuf, pindex + 1));
13630 pindex += 2;
13631 if (second == NULL ||
13632 _PyAccu_Accumulate(&acc, zero) ||
13633 _PyAccu_Accumulate(&acc, second))
13634 goto onError;
13635 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013636 }
13637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013638 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013639 if (temp != NULL) {
13640 assert(pbuf == PyUnicode_DATA(temp));
13641 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013642 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013643 else {
13644 const char *p = (const char *) pbuf;
13645 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013646 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 v = PyUnicode_FromKindAndData(kind, p, len);
13648 }
13649 if (v == NULL)
13650 goto onError;
13651 r = _PyAccu_Accumulate(&acc, v);
13652 Py_DECREF(v);
13653 if (r)
13654 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013655 if (width > len && repeat_accumulate(&acc, blank, width - len))
13656 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 if (dict && (argidx < arglen) && c != '%') {
13658 PyErr_SetString(PyExc_TypeError,
13659 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 goto onError;
13661 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013662 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013664 } /* until end */
13665 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 PyErr_SetString(PyExc_TypeError,
13667 "not all arguments converted during string formatting");
13668 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013669 }
13670
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674 }
13675 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013676 Py_XDECREF(temp);
13677 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013678 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013682 Py_XDECREF(temp);
13683 Py_XDECREF(second);
13684 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013686 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687 }
13688 return NULL;
13689}
13690
Jeremy Hylton938ace62002-07-17 16:30:39 +000013691static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013692unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13693
Tim Peters6d6c1a32001-08-02 04:15:00 +000013694static PyObject *
13695unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13696{
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013698 static char *kwlist[] = {"object", "encoding", "errors", 0};
13699 char *encoding = NULL;
13700 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013701
Benjamin Peterson14339b62009-01-31 16:36:08 +000013702 if (type != &PyUnicode_Type)
13703 return unicode_subtype_new(type, args, kwds);
13704 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013707 if (x == NULL) {
13708 Py_INCREF(unicode_empty);
13709 return unicode_empty;
13710 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013711 if (encoding == NULL && errors == NULL)
13712 return PyObject_Str(x);
13713 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013715}
13716
Guido van Rossume023fe02001-08-30 03:12:59 +000013717static PyObject *
13718unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13719{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013720 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013721 Py_ssize_t length, char_size;
13722 int share_wstr, share_utf8;
13723 unsigned int kind;
13724 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013725
Benjamin Peterson14339b62009-01-31 16:36:08 +000013726 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013727
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013728 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013729 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013730 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013731 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013732 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013733 return NULL;
13734
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013735 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013736 if (self == NULL) {
13737 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013738 return NULL;
13739 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013740 kind = PyUnicode_KIND(unicode);
13741 length = PyUnicode_GET_LENGTH(unicode);
13742
13743 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013744#ifdef Py_DEBUG
13745 _PyUnicode_HASH(self) = -1;
13746#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013747 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013748#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013749 _PyUnicode_STATE(self).interned = 0;
13750 _PyUnicode_STATE(self).kind = kind;
13751 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013752 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753 _PyUnicode_STATE(self).ready = 1;
13754 _PyUnicode_WSTR(self) = NULL;
13755 _PyUnicode_UTF8_LENGTH(self) = 0;
13756 _PyUnicode_UTF8(self) = NULL;
13757 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013758 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013759
13760 share_utf8 = 0;
13761 share_wstr = 0;
13762 if (kind == PyUnicode_1BYTE_KIND) {
13763 char_size = 1;
13764 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13765 share_utf8 = 1;
13766 }
13767 else if (kind == PyUnicode_2BYTE_KIND) {
13768 char_size = 2;
13769 if (sizeof(wchar_t) == 2)
13770 share_wstr = 1;
13771 }
13772 else {
13773 assert(kind == PyUnicode_4BYTE_KIND);
13774 char_size = 4;
13775 if (sizeof(wchar_t) == 4)
13776 share_wstr = 1;
13777 }
13778
13779 /* Ensure we won't overflow the length. */
13780 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13781 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013782 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013783 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013784 data = PyObject_MALLOC((length + 1) * char_size);
13785 if (data == NULL) {
13786 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 goto onError;
13788 }
13789
Victor Stinnerc3c74152011-10-02 20:39:55 +020013790 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013791 if (share_utf8) {
13792 _PyUnicode_UTF8_LENGTH(self) = length;
13793 _PyUnicode_UTF8(self) = data;
13794 }
13795 if (share_wstr) {
13796 _PyUnicode_WSTR_LENGTH(self) = length;
13797 _PyUnicode_WSTR(self) = (wchar_t *)data;
13798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013800 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013801 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013802 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013803#ifdef Py_DEBUG
13804 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13805#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013806 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013807 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013808
13809onError:
13810 Py_DECREF(unicode);
13811 Py_DECREF(self);
13812 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013813}
13814
/* Docstring for the `str` type; referenced from the tp_doc slot of
   PyUnicode_Type.  The continuation lines must stay at column 0 because
   any leading whitespace would become part of the string. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013821
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013822static PyObject *unicode_iter(PyObject *seq);
13823
/* Type object for `str`.  The initializer is positional: each entry is
   labelled with the PyTypeObject slot it fills, and a 0 entry means the
   slot is not set here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
13867
13868/* Initialize the Unicode implementation */
13869
Victor Stinner3a50e702011-10-18 21:21:00 +020013870int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013872 int i;
13873
Thomas Wouters477c8d52006-05-27 19:21:47 +000013874 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013876 0x000A, /* LINE FEED */
13877 0x000D, /* CARRIAGE RETURN */
13878 0x001C, /* FILE SEPARATOR */
13879 0x001D, /* GROUP SEPARATOR */
13880 0x001E, /* RECORD SEPARATOR */
13881 0x0085, /* NEXT LINE */
13882 0x2028, /* LINE SEPARATOR */
13883 0x2029, /* PARAGRAPH SEPARATOR */
13884 };
13885
Fred Drakee4315f52000-05-09 19:53:39 +000013886 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013887 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013888 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013889 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013890 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013891
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013892 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013894 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013896
13897 /* initialize the linebreak bloom filter */
13898 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013899 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013900 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013901
13902 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013903
13904#ifdef HAVE_MBCS
13905 winver.dwOSVersionInfoSize = sizeof(winver);
13906 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13907 PyErr_SetFromWindowsErr(0);
13908 return -1;
13909 }
13910#endif
13911 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912}
13913
13914/* Finalize the Unicode implementation */
13915
/* Free-list clearing hook.  The body does no work and always reports
   that zero entries were released. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed_count = 0;

    return freed_count;
}
13921
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922void
Thomas Wouters78890102000-07-22 19:25:51 +000013923_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013925 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013927 Py_XDECREF(unicode_empty);
13928 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013929
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013930 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013931 if (unicode_latin1[i]) {
13932 Py_DECREF(unicode_latin1[i]);
13933 unicode_latin1[i] = NULL;
13934 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013935 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013936 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013937 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013939
/* Intern the string referenced by *p, in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by that object (a new reference; the old reference in *p is
   dropped).  Otherwise *p itself is inserted and marked
   SSTATE_INTERNED_MORTAL.

   Nothing is done for NULL / non-str arguments (release builds; debug
   builds assert instead), for str subclasses, or for strings that are
   already interned.  The function never reports failure: any exception
   raised while creating or updating the dict is cleared and *p is left
   unchanged. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one back.
           PyDict_GetItem returned a borrowed reference, hence the
           INCREF before the caller's old reference is released. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The flag is raised only around the insertion and lowered again on
       both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13991
13992void
13993PyUnicode_InternImmortal(PyObject **p)
13994{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013995 PyUnicode_InternInPlace(p);
13996 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013997 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013998 Py_INCREF(*p);
13999 }
Walter Dörwald16807132007-05-25 13:52:07 +000014000}
14001
14002PyObject *
14003PyUnicode_InternFromString(const char *cp)
14004{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014005 PyObject *s = PyUnicode_FromString(cp);
14006 if (s == NULL)
14007 return NULL;
14008 PyUnicode_InternInPlace(&s);
14009 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014010}
14011
Alexander Belopolsky40018472011-02-26 01:02:56 +000014012void
14013_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014014{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014016 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014017 Py_ssize_t i, n;
14018 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014019
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 if (interned == NULL || !PyDict_Check(interned))
14021 return;
14022 keys = PyDict_Keys(interned);
14023 if (keys == NULL || !PyList_Check(keys)) {
14024 PyErr_Clear();
14025 return;
14026 }
Walter Dörwald16807132007-05-25 13:52:07 +000014027
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14029 detector, interned unicode strings are not forcibly deallocated;
14030 rather, we give them their stolen references back, and then clear
14031 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014032
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 n = PyList_GET_SIZE(keys);
14034 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014035 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014037 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014038 if (PyUnicode_READY(s) == -1) {
14039 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014040 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014042 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014043 case SSTATE_NOT_INTERNED:
14044 /* XXX Shouldn't happen */
14045 break;
14046 case SSTATE_INTERNED_IMMORTAL:
14047 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 break;
14050 case SSTATE_INTERNED_MORTAL:
14051 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014052 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 break;
14054 default:
14055 Py_FatalError("Inconsistent interned string state.");
14056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014057 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014058 }
14059 fprintf(stderr, "total size of all interned strings: "
14060 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14061 "mortal/immortal\n", mortal_size, immortal_size);
14062 Py_DECREF(keys);
14063 PyDict_Clear(interned);
14064 Py_DECREF(interned);
14065 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014066}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014067
14068
14069/********************* Unicode Iterator **************************/
14070
/* State of a `str_iterator` object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;  /* index of the next character to return */
    PyObject *it_seq;     /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14076
/* tp_dealloc for str_iterator: untrack the iterator from the GC, release
   the reference to the iterated string (if still held) and free the
   object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);  /* it_seq is NULL once the iterator is exhausted */
    PyObject_GC_Del(it);
}
14084
/* tp_traverse for str_iterator: the only object reference held is
   it_seq.  Always returns 0 unless the Py_VISIT call returns early. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14091
14092static PyObject *
14093unicodeiter_next(unicodeiterobject *it)
14094{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014095 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014096
Benjamin Peterson14339b62009-01-31 16:36:08 +000014097 assert(it != NULL);
14098 seq = it->it_seq;
14099 if (seq == NULL)
14100 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014101 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14104 int kind = PyUnicode_KIND(seq);
14105 void *data = PyUnicode_DATA(seq);
14106 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14107 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014108 if (item != NULL)
14109 ++it->it_index;
14110 return item;
14111 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 Py_DECREF(seq);
14114 it->it_seq = NULL;
14115 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014116}
14117
14118static PyObject *
14119unicodeiter_len(unicodeiterobject *it)
14120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 Py_ssize_t len = 0;
14122 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014123 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014124 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014125}
14126
/* Docstring for str_iterator.__length_hint__. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for str_iterator; the only method is __length_hint__,
   which takes no arguments. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
14134
/* Type object for `str_iterator`, the iterator returned by unicode_iter().
   The initializer is positional; fields after the last listed entry are
   implicitly zero. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
14167
14168static PyObject *
14169unicode_iter(PyObject *seq)
14170{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014171 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014172
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 if (!PyUnicode_Check(seq)) {
14174 PyErr_BadInternalCall();
14175 return NULL;
14176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014177 if (PyUnicode_READY(seq) == -1)
14178 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14180 if (it == NULL)
14181 return NULL;
14182 it->it_index = 0;
14183 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014184 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 _PyObject_GC_TRACK(it);
14186 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014187}
14188
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014189
/* Return the number of Py_UNICODE elements in `u` before the terminating
   zero element.  `u` must be zero-terminated.

   The counter has the same type as the return value, so strings longer
   than INT_MAX elements are counted correctly. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
14198
/* Copy the zero-terminated string `s2`, terminator included, into `s1`
   and return `s1`.  The caller must provide enough room in `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
14206
/* Copy at most `n` elements of the zero-terminated string `s2` into `s1`
   and return `s1`.

   Copying stops after the terminating zero has been copied or after `n`
   elements, whichever comes first; nothing is written when n == 0.
   Unlike C strncpy(), the remainder of `s1` is not zero-padded, and `s1`
   is left unterminated when `s2` holds `n` or more characters.

   The limit is tested before each store so that no more than `n`
   elements of `s1` are ever written. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14216
14217Py_UNICODE*
14218Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14219{
14220 Py_UNICODE *u1 = s1;
14221 u1 += Py_UNICODE_strlen(u1);
14222 Py_UNICODE_strcpy(u1, s2);
14223 return s1;
14224}
14225
/* Compare two zero-terminated strings element by element.
   Returns -1, 0 or +1 when `s1` sorts before, equal to or after `s2`;
   a string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;   /* s1 ended first, or both ended */
    if (*s2 == 0)
        return 1;                     /* s2 ended first */
    return (*s1 < *s2) ? -1 : +1;     /* first differing element decides */
}
14239
/* Compare at most `n` elements of two zero-terminated strings.
   Returns -1 or +1 at the first differing element, and 0 if the strings
   agree up to `n` elements or up to a shared terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
14256
/* Return a pointer to the first element of the zero-terminated string
   `s` equal to `c`, or NULL if there is none.  The terminator itself is
   never examined, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    while (*cursor != 0) {
        if (*cursor == c)
            return (Py_UNICODE*)cursor;
        cursor++;
    }
    return NULL;
}
14266
14267Py_UNICODE*
14268Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14269{
14270 const Py_UNICODE *p;
14271 p = s + Py_UNICODE_strlen(s);
14272 while (p != s) {
14273 p--;
14274 if (*p == c)
14275 return (Py_UNICODE*)p;
14276 }
14277 return NULL;
14278}
Victor Stinner331ea922010-08-10 16:37:20 +000014279
Victor Stinner71133ff2010-09-01 23:43:53 +000014280Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014281PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014282{
Victor Stinner577db2c2011-10-11 22:12:48 +020014283 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014284 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014286 if (!PyUnicode_Check(unicode)) {
14287 PyErr_BadArgument();
14288 return NULL;
14289 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014290 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014291 if (u == NULL)
14292 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014293 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014294 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014295 PyErr_NoMemory();
14296 return NULL;
14297 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014298 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014299 size *= sizeof(Py_UNICODE);
14300 copy = PyMem_Malloc(size);
14301 if (copy == NULL) {
14302 PyErr_NoMemory();
14303 return NULL;
14304 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014305 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014306 return copy;
14307}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014308
Georg Brandl66c221e2010-10-14 07:04:07 +000014309/* A _string module, to export formatter_parser and formatter_field_name_split
14310 to the string.Formatter class implemented in Python. */
14311
/* Functions exported by the _string module.  Both entries are METH_O,
   i.e. each takes exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14319
/* Module definition for _string (positional PyModuleDef initializer). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* module state size */
    _string_methods,                    /* method table */
    /* remaining optional members are left unset */
    NULL,
    NULL,
    NULL,
    NULL
};
14331
/* Module initialization function for _string: creates the module object
   from _string_module and returns whatever PyModule_Create() returns. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
14337
14338
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014339#ifdef __cplusplus
14340}
14341#endif