blob: ddb7baa2de3568b9d9edae5de718fbf934125539 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
51#ifdef WORDS_BIGENDIAN
52# define BYTEORDER_IS_BIG_ENDIAN
53#else
54# define BYTEORDER_IS_LITTLE_ENDIAN
55#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner8faf8212011-12-08 22:14:11 +010069/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70#define MAX_UNICODE 0x10ffff
71
Victor Stinner910337b2011-10-03 03:20:16 +020072#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020073# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020074#else
75# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
76#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
Victor Stinnere90fe6a2011-10-01 16:48:13 +020078#define _PyUnicode_UTF8(op) \
79 (((PyCompactUnicodeObject*)(op))->utf8)
80#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020081 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020082 assert(PyUnicode_IS_READY(op)), \
83 PyUnicode_IS_COMPACT_ASCII(op) ? \
84 ((char*)((PyASCIIObject*)(op) + 1)) : \
85 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020086#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020087 (((PyCompactUnicodeObject*)(op))->utf8_length)
88#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020089 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020090 assert(PyUnicode_IS_READY(op)), \
91 PyUnicode_IS_COMPACT_ASCII(op) ? \
92 ((PyASCIIObject*)(op))->length : \
93 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020094#define _PyUnicode_WSTR(op) \
95 (((PyASCIIObject*)(op))->wstr)
96#define _PyUnicode_WSTR_LENGTH(op) \
97 (((PyCompactUnicodeObject*)(op))->wstr_length)
98#define _PyUnicode_LENGTH(op) \
99 (((PyASCIIObject *)(op))->length)
100#define _PyUnicode_STATE(op) \
101 (((PyASCIIObject *)(op))->state)
102#define _PyUnicode_HASH(op) \
103 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200104#define _PyUnicode_KIND(op) \
105 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200107#define _PyUnicode_GET_LENGTH(op) \
108 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200110#define _PyUnicode_DATA_ANY(op) \
111 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
Victor Stinner910337b2011-10-03 03:20:16 +0200113#undef PyUnicode_READY
114#define PyUnicode_READY(op) \
115 (assert(_PyUnicode_CHECK(op)), \
116 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200117 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100118 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
Victor Stinnerc379ead2011-10-03 12:52:27 +0200120#define _PyUnicode_SHARE_UTF8(op) \
121 (assert(_PyUnicode_CHECK(op)), \
122 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
123 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the very same pointer as its
   character data, i.e. no separate wchar_t buffer is in use.
   The macro must only refer to its own parameter `op`, never to a
   variable that happens to exist at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
Victor Stinner829c0ad2011-10-03 01:08:02 +0200128/* true if the Unicode object has an allocated UTF-8 memory block
129 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200130#define _PyUnicode_HAS_UTF8_MEMORY(op) \
131 (assert(_PyUnicode_CHECK(op)), \
132 (!PyUnicode_IS_COMPACT_ASCII(op) \
133 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200134 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
135
Victor Stinner03490912011-10-03 23:45:12 +0200136/* true if the Unicode object has an allocated wstr memory block
137 (not shared with other data) */
138#define _PyUnicode_HAS_WSTR_MEMORY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (_PyUnicode_WSTR(op) && \
141 (!PyUnicode_IS_READY(op) || \
142 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four characters per iteration; the
   remaining (at most three) characters are copied one by one.
   The `to` argument is parenthesized before being cast so that an
   expression such as `buf + offset` is converted as a whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 marks a whitespace
   character, 0 anything else.  Each row below covers 16 codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Fast detection of ASCII line break characters.
   128-entry table indexed by ASCII code: 1 marks a line break
   character, 0 anything else.  Each row below covers 16 codes. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Verify the internal invariants of a unicode object (debug builds only).

   Every condition is checked with assert().  When check_content is
   non-zero and the kind is not PyUnicode_WCHAR_KIND, the characters are
   also scanned to verify that the largest character actually requires
   the kind that is in use.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact object that only has a wstr buffer so far */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact object with a character buffer */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to alias the data exactly when the kind
               has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
500 return PyUnicode_Copy(unicode);
501}
502
Victor Stinner3a50e702011-10-18 21:21:00 +0200503#ifdef HAVE_MBCS
504static OSVERSIONINFOEX winver;
505#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */
512
513/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525#define BLOOM_MASK unsigned long
526
527static BLOOM_MASK bloom_linebreak;
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
530#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532#define BLOOM_LINEBREAK(ch) \
533 ((ch) < 128U ? ascii_linebreak[(ch)] : \
534 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551#define BLOOM_MEMBER(mask, chr, str) \
552 (BLOOM(mask, chr) \
553 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100658 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyObject_Del(unicode);
670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100691 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000694
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 if (PyUnicode_IS_READY(unicode)) {
696 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200697 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 void *data;
699
700 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200701 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200702 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
703 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704
705 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
706 PyErr_NoMemory();
707 return -1;
708 }
709 new_size = (length + 1) * char_size;
710
Victor Stinner7a9105a2011-12-12 00:13:42 +0100711 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
712 {
713 PyObject_DEL(_PyUnicode_UTF8(unicode));
714 _PyUnicode_UTF8(unicode) = NULL;
715 _PyUnicode_UTF8_LENGTH(unicode) = 0;
716 }
717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 data = (PyObject *)PyObject_REALLOC(data, new_size);
719 if (data == NULL) {
720 PyErr_NoMemory();
721 return -1;
722 }
723 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 _PyUnicode_WSTR_LENGTH(unicode) = length;
727 }
728 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200729 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 _PyUnicode_UTF8_LENGTH(unicode) = length;
731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 _PyUnicode_LENGTH(unicode) = length;
733 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200734 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200735 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 }
Victor Stinner95663112011-10-04 01:03:50 +0200739 assert(_PyUnicode_WSTR(unicode) != NULL);
740
741 /* check for integer overflow */
742 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
743 PyErr_NoMemory();
744 return -1;
745 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100746 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200747 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100748 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200749 if (!wstr) {
750 PyErr_NoMemory();
751 return -1;
752 }
753 _PyUnicode_WSTR(unicode) = wstr;
754 _PyUnicode_WSTR(unicode)[length] = 0;
755 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200756 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 return 0;
758}
759
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760static PyObject*
761resize_copy(PyObject *unicode, Py_ssize_t length)
762{
763 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100764 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100766
767 if (PyUnicode_READY(unicode) < 0)
768 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769
770 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
771 if (copy == NULL)
772 return NULL;
773
774 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200775 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200777 }
778 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200779 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100780
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000793 Ux0000 terminated; some code (e.g. new_identifier)
794 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000795
796 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000797 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798
799*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
838 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000863
Benjamin Peterson29060642009-01-31 22:14:21 +0000864 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000865 /* XXX UNREF/NEWREF interface should be more symmetrical */
866 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000867 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000868 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870}
871
Victor Stinnerf42dc442011-10-02 23:33:16 +0200872static const char*
873unicode_kind_name(PyObject *unicode)
874{
Victor Stinner42dfd712011-10-03 14:41:45 +0200875 /* don't check consistency: unicode_kind_name() is called from
876 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877 if (!PyUnicode_IS_COMPACT(unicode))
878 {
879 if (!PyUnicode_IS_READY(unicode))
880 return "wstr";
881 switch(PyUnicode_KIND(unicode))
882 {
883 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200884 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 return "legacy ascii";
886 else
887 return "legacy latin1";
888 case PyUnicode_2BYTE_KIND:
889 return "legacy UCS2";
890 case PyUnicode_4BYTE_KIND:
891 return "legacy UCS4";
892 default:
893 return "<legacy invalid kind>";
894 }
895 }
896 assert(PyUnicode_IS_READY(unicode));
897 switch(PyUnicode_KIND(unicode))
898 {
899 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 return "ascii";
902 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200903 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200904 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 default:
909 return "<invalid compact kind>";
910 }
911}
912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200914static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915
916/* Functions wrapping macros for use in debugger */
917char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200918 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919}
920
921void *_PyUnicode_compact_data(void *unicode) {
922 return _PyUnicode_COMPACT_DATA(unicode);
923}
924void *_PyUnicode_data(void *unicode){
925 printf("obj %p\n", unicode);
926 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
927 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
928 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
929 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
930 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
931 return PyUnicode_DATA(unicode);
932}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200933
934void
935_PyUnicode_Dump(PyObject *op)
936{
937 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200938 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
939 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
940 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200941
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200943 {
944 if (ascii->state.ascii)
945 data = (ascii + 1);
946 else
947 data = (compact + 1);
948 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 else
950 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200951 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
952
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 if (ascii->wstr == data)
954 printf("shared ");
955 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200956
Victor Stinnera3b334d2011-10-03 13:53:37 +0200957 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200958 printf(" (%zu), ", compact->wstr_length);
959 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
960 printf("shared ");
961 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200962 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200963 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200964}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965#endif
966
967PyObject *
968PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
969{
970 PyObject *obj;
971 PyCompactUnicodeObject *unicode;
972 void *data;
973 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 Py_ssize_t char_size;
976 Py_ssize_t struct_size;
977
978 /* Optimization for empty strings */
979 if (size == 0 && unicode_empty != NULL) {
980 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200981 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 }
983
984#ifdef Py_DEBUG
985 ++unicode_new_new_calls;
986#endif
987
Victor Stinner9e9d6892011-10-04 01:02:02 +0200988 is_ascii = 0;
989 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 struct_size = sizeof(PyCompactUnicodeObject);
991 if (maxchar < 128) {
992 kind_state = PyUnicode_1BYTE_KIND;
993 char_size = 1;
994 is_ascii = 1;
995 struct_size = sizeof(PyASCIIObject);
996 }
997 else if (maxchar < 256) {
998 kind_state = PyUnicode_1BYTE_KIND;
999 char_size = 1;
1000 }
1001 else if (maxchar < 65536) {
1002 kind_state = PyUnicode_2BYTE_KIND;
1003 char_size = 2;
1004 if (sizeof(wchar_t) == 2)
1005 is_sharing = 1;
1006 }
1007 else {
1008 kind_state = PyUnicode_4BYTE_KIND;
1009 char_size = 4;
1010 if (sizeof(wchar_t) == 4)
1011 is_sharing = 1;
1012 }
1013
1014 /* Ensure we won't overflow the size. */
1015 if (size < 0) {
1016 PyErr_SetString(PyExc_SystemError,
1017 "Negative size passed to PyUnicode_New");
1018 return NULL;
1019 }
1020 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1021 return PyErr_NoMemory();
1022
1023 /* Duplicated allocation code from _PyObject_New() instead of a call to
1024 * PyObject_New() so we are able to allocate space for the object and
1025 * it's data buffer.
1026 */
1027 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1028 if (obj == NULL)
1029 return PyErr_NoMemory();
1030 obj = PyObject_INIT(obj, &PyUnicode_Type);
1031 if (obj == NULL)
1032 return NULL;
1033
1034 unicode = (PyCompactUnicodeObject *)obj;
1035 if (is_ascii)
1036 data = ((PyASCIIObject*)obj) + 1;
1037 else
1038 data = unicode + 1;
1039 _PyUnicode_LENGTH(unicode) = size;
1040 _PyUnicode_HASH(unicode) = -1;
1041 _PyUnicode_STATE(unicode).interned = 0;
1042 _PyUnicode_STATE(unicode).kind = kind_state;
1043 _PyUnicode_STATE(unicode).compact = 1;
1044 _PyUnicode_STATE(unicode).ready = 1;
1045 _PyUnicode_STATE(unicode).ascii = is_ascii;
1046 if (is_ascii) {
1047 ((char*)data)[size] = 0;
1048 _PyUnicode_WSTR(unicode) = NULL;
1049 }
1050 else if (kind_state == PyUnicode_1BYTE_KIND) {
1051 ((char*)data)[size] = 0;
1052 _PyUnicode_WSTR(unicode) = NULL;
1053 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001055 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 }
1057 else {
1058 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001059 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 if (kind_state == PyUnicode_2BYTE_KIND)
1061 ((Py_UCS2*)data)[size] = 0;
1062 else /* kind_state == PyUnicode_4BYTE_KIND */
1063 ((Py_UCS4*)data)[size] = 0;
1064 if (is_sharing) {
1065 _PyUnicode_WSTR_LENGTH(unicode) = size;
1066 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1067 }
1068 else {
1069 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1070 _PyUnicode_WSTR(unicode) = NULL;
1071 }
1072 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001073 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 return obj;
1075}
1076
1077#if SIZEOF_WCHAR_T == 2
1078/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1079 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001080 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081
1082 This function assumes that unicode can hold one more code point than wstr
1083 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001084static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001086 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087{
1088 const wchar_t *iter;
1089 Py_UCS4 *ucs4_out;
1090
Victor Stinner910337b2011-10-03 03:20:16 +02001091 assert(unicode != NULL);
1092 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1094 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1095
1096 for (iter = begin; iter < end; ) {
1097 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1098 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001099 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1100 && (iter+1) < end
1101 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102 {
Victor Stinner551ac952011-11-29 22:58:13 +01001103 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 iter += 2;
1105 }
1106 else {
1107 *ucs4_out++ = *iter;
1108 iter++;
1109 }
1110 }
1111 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1112 _PyUnicode_GET_LENGTH(unicode)));
1113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114}
1115#endif
1116
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117static int
Victor Stinner488fa492011-12-12 00:01:39 +01001118unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119{
Victor Stinner488fa492011-12-12 00:01:39 +01001120 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001121 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001122 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001123 return -1;
1124 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001125 return 0;
1126}
1127
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128static int
1129_copy_characters(PyObject *to, Py_ssize_t to_start,
1130 PyObject *from, Py_ssize_t from_start,
1131 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001133 unsigned int from_kind, to_kind;
1134 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137 assert(PyUnicode_Check(from));
1138 assert(PyUnicode_Check(to));
1139 assert(PyUnicode_IS_READY(from));
1140 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1143 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1144 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001146 if (how_many == 0)
1147 return 0;
1148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001150 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001152 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154#ifdef Py_DEBUG
1155 if (!check_maxchar
1156 && (from_kind > to_kind
1157 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001159 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1160 Py_UCS4 ch;
1161 Py_ssize_t i;
1162 for (i=0; i < how_many; i++) {
1163 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1164 assert(ch <= to_maxchar);
1165 }
1166 }
1167#endif
1168 fast = (from_kind == to_kind);
1169 if (check_maxchar
1170 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1171 {
1172 /* deny latin1 => ascii */
1173 fast = 0;
1174 }
1175
1176 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001177 Py_MEMCPY((char*)to_data + to_kind * to_start,
1178 (char*)from_data + from_kind * from_start,
1179 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001181 else if (from_kind == PyUnicode_1BYTE_KIND
1182 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 {
1184 _PyUnicode_CONVERT_BYTES(
1185 Py_UCS1, Py_UCS2,
1186 PyUnicode_1BYTE_DATA(from) + from_start,
1187 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1188 PyUnicode_2BYTE_DATA(to) + to_start
1189 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001191 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 && to_kind == PyUnicode_4BYTE_KIND)
1193 {
1194 _PyUnicode_CONVERT_BYTES(
1195 Py_UCS1, Py_UCS4,
1196 PyUnicode_1BYTE_DATA(from) + from_start,
1197 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1198 PyUnicode_4BYTE_DATA(to) + to_start
1199 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001200 }
1201 else if (from_kind == PyUnicode_2BYTE_KIND
1202 && to_kind == PyUnicode_4BYTE_KIND)
1203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS2, Py_UCS4,
1206 PyUnicode_2BYTE_DATA(from) + from_start,
1207 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_4BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001212 /* check if max_char(from substring) <= max_char(to) */
1213 if (from_kind > to_kind
1214 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001215 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001216 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 /* slow path to check for character overflow */
1218 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001219 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 Py_ssize_t i;
1221
Victor Stinner56c161a2011-10-06 02:47:11 +02001222#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001225 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001226 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1227 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001228#else
1229 if (!check_maxchar) {
1230 for (i=0; i < how_many; i++) {
1231 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1232 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1233 }
1234 }
1235 else {
1236 for (i=0; i < how_many; i++) {
1237 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1238 if (ch > to_maxchar)
1239 return 1;
1240 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1241 }
1242 }
1243#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001244 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001245 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001246 assert(0 && "inconsistent state");
1247 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001248 }
1249 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001250 return 0;
1251}
1252
1253static void
1254copy_characters(PyObject *to, Py_ssize_t to_start,
1255 PyObject *from, Py_ssize_t from_start,
1256 Py_ssize_t how_many)
1257{
1258 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1259}
1260
1261Py_ssize_t
1262PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1263 PyObject *from, Py_ssize_t from_start,
1264 Py_ssize_t how_many)
1265{
1266 int err;
1267
1268 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1269 PyErr_BadInternalCall();
1270 return -1;
1271 }
1272
1273 if (PyUnicode_READY(from))
1274 return -1;
1275 if (PyUnicode_READY(to))
1276 return -1;
1277
1278 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1279 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1280 PyErr_Format(PyExc_SystemError,
1281 "Cannot write %zi characters at %zi "
1282 "in a string of %zi characters",
1283 how_many, to_start, PyUnicode_GET_LENGTH(to));
1284 return -1;
1285 }
1286
1287 if (how_many == 0)
1288 return 0;
1289
Victor Stinner488fa492011-12-12 00:01:39 +01001290 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return -1;
1292
1293 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1294 if (err) {
1295 PyErr_Format(PyExc_SystemError,
1296 "Cannot copy %s characters "
1297 "into a string of %s characters",
1298 unicode_kind_name(from),
1299 unicode_kind_name(to));
1300 return -1;
1301 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001302 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303}
1304
Victor Stinner17222162011-09-28 22:15:37 +02001305/* Find the maximum code point and count the number of surrogate pairs so a
1306 correct string length can be computed before converting a string to UCS4.
1307 This function counts single surrogates as a character and not as a pair.
1308
1309 Return 0 on success, or -1 on error. */
1310static int
1311find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1312 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313{
1314 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001315 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316
Victor Stinnerc53be962011-10-02 21:33:54 +02001317 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 *num_surrogates = 0;
1319 *maxchar = 0;
1320
1321 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001323 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1324 && (iter+1) < end
1325 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001327 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 iter += 2;
1330 }
1331 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001333 {
1334 ch = *iter;
1335 iter++;
1336 }
1337 if (ch > *maxchar) {
1338 *maxchar = ch;
1339 if (*maxchar > MAX_UNICODE) {
1340 PyErr_Format(PyExc_ValueError,
1341 "character U+%x is not in range [U+0000; U+10ffff]",
1342 ch);
1343 return -1;
1344 }
1345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 }
1347 return 0;
1348}
1349
1350#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001351static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352#endif
1353
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001354int
1355_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356{
1357 wchar_t *end;
1358 Py_UCS4 maxchar = 0;
1359 Py_ssize_t num_surrogates;
1360#if SIZEOF_WCHAR_T == 2
1361 Py_ssize_t length_wo_surrogates;
1362#endif
1363
Georg Brandl7597add2011-10-05 16:36:47 +02001364 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001365 strings were created using _PyObject_New() and where no canonical
1366 representation (the str field) has been set yet aka strings
1367 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001368 assert(_PyUnicode_CHECK(unicode));
1369 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001371 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001372 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001373 /* Actually, it should neither be interned nor be anything else: */
1374 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375
1376#ifdef Py_DEBUG
1377 ++unicode_ready_calls;
1378#endif
1379
1380 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001381 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384
1385 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001386 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1387 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 PyErr_NoMemory();
1389 return -1;
1390 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001391 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 _PyUnicode_WSTR(unicode), end,
1393 PyUnicode_1BYTE_DATA(unicode));
1394 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1395 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1396 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1397 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001398 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001399 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001400 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 }
1402 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001403 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001404 _PyUnicode_UTF8(unicode) = NULL;
1405 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 }
1407 PyObject_FREE(_PyUnicode_WSTR(unicode));
1408 _PyUnicode_WSTR(unicode) = NULL;
1409 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1410 }
1411 /* In this case we might have to convert down from 4-byte native
1412 wchar_t to 2-byte unicode. */
1413 else if (maxchar < 65536) {
1414 assert(num_surrogates == 0 &&
1415 "FindMaxCharAndNumSurrogatePairs() messed up");
1416
Victor Stinner506f5922011-09-28 22:34:18 +02001417#if SIZEOF_WCHAR_T == 2
1418 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001420 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1421 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1422 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001423 _PyUnicode_UTF8(unicode) = NULL;
1424 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001425#else
1426 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001428 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001429 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001430 PyErr_NoMemory();
1431 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432 }
Victor Stinner506f5922011-09-28 22:34:18 +02001433 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1434 _PyUnicode_WSTR(unicode), end,
1435 PyUnicode_2BYTE_DATA(unicode));
1436 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1437 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1438 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001439 _PyUnicode_UTF8(unicode) = NULL;
1440 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001441 PyObject_FREE(_PyUnicode_WSTR(unicode));
1442 _PyUnicode_WSTR(unicode) = NULL;
1443 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1444#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
1446 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1447 else {
1448#if SIZEOF_WCHAR_T == 2
1449 /* in case the native representation is 2-bytes, we need to allocate a
1450 new normalized 4-byte version. */
1451 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001452 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1453 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 PyErr_NoMemory();
1455 return -1;
1456 }
1457 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1458 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 _PyUnicode_UTF8(unicode) = NULL;
1460 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001461 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1462 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001463 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 PyObject_FREE(_PyUnicode_WSTR(unicode));
1465 _PyUnicode_WSTR(unicode) = NULL;
1466 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467#else
1468 assert(num_surrogates == 0);
1469
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1475#endif
1476 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1477 }
1478 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001479 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 return 0;
1481}
1482
Alexander Belopolsky40018472011-02-26 01:02:56 +00001483static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001484unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485{
Walter Dörwald16807132007-05-25 13:52:07 +00001486 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 case SSTATE_NOT_INTERNED:
1488 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 case SSTATE_INTERNED_MORTAL:
1491 /* revive dead object temporarily for DelItem */
1492 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001493 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 Py_FatalError(
1495 "deletion of interned string failed");
1496 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001497
Benjamin Peterson29060642009-01-31 22:14:21 +00001498 case SSTATE_INTERNED_IMMORTAL:
1499 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001500
Benjamin Peterson29060642009-01-31 22:14:21 +00001501 default:
1502 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001503 }
1504
Victor Stinner03490912011-10-03 23:45:12 +02001505 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001506 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001507 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001508 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001509
1510 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001511 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 }
1513 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001514 if (_PyUnicode_DATA_ANY(unicode))
1515 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001516 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
1518}
1519
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    header = (PyASCIIObject *)unicode;
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1536
Alexander Belopolsky40018472011-02-26 01:02:56 +00001537static int
Victor Stinner488fa492011-12-12 00:01:39 +01001538unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001539{
Victor Stinner488fa492011-12-12 00:01:39 +01001540 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 if (Py_REFCNT(unicode) != 1)
1542 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001543 if (_PyUnicode_HASH(unicode) != -1)
1544 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 if (PyUnicode_CHECK_INTERNED(unicode))
1546 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001547 if (!PyUnicode_CheckExact(unicode))
1548 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001549#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550 /* singleton refcount is greater than 1 */
1551 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001552#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 return 1;
1554}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001555
Victor Stinnerfe226c02011-10-03 03:52:20 +02001556static int
1557unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1558{
1559 PyObject *unicode;
1560 Py_ssize_t old_length;
1561
1562 assert(p_unicode != NULL);
1563 unicode = *p_unicode;
1564
1565 assert(unicode != NULL);
1566 assert(PyUnicode_Check(unicode));
1567 assert(0 <= length);
1568
Victor Stinner910337b2011-10-03 03:20:16 +02001569 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 old_length = PyUnicode_WSTR_LENGTH(unicode);
1571 else
1572 old_length = PyUnicode_GET_LENGTH(unicode);
1573 if (old_length == length)
1574 return 0;
1575
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001576 if (length == 0) {
1577 Py_DECREF(*p_unicode);
1578 *p_unicode = unicode_empty;
1579 Py_INCREF(*p_unicode);
1580 return 0;
1581 }
1582
Victor Stinner488fa492011-12-12 00:01:39 +01001583 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 PyObject *copy = resize_copy(unicode, length);
1585 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587 Py_DECREF(*p_unicode);
1588 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001590 }
1591
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 if (PyUnicode_IS_COMPACT(unicode)) {
1593 *p_unicode = resize_compact(unicode, length);
1594 if (*p_unicode == NULL)
1595 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001596 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001597 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001598 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001599 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001600}
1601
Alexander Belopolsky40018472011-02-26 01:02:56 +00001602int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001604{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001605 PyObject *unicode;
1606 if (p_unicode == NULL) {
1607 PyErr_BadInternalCall();
1608 return -1;
1609 }
1610 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 {
1613 PyErr_BadInternalCall();
1614 return -1;
1615 }
1616 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001617}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001619static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001620unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001621{
1622 PyObject *result;
1623 assert(PyUnicode_IS_READY(*p_unicode));
1624 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1625 return 0;
1626 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1627 maxchar);
1628 if (result == NULL)
1629 return -1;
1630 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1631 PyUnicode_GET_LENGTH(*p_unicode));
1632 Py_DECREF(*p_unicode);
1633 *p_unicode = result;
1634 return 0;
1635}
1636
1637static int
1638unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1639 Py_UCS4 ch)
1640{
1641 if (unicode_widen(p_unicode, ch) < 0)
1642 return -1;
1643 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1644 PyUnicode_DATA(*p_unicode),
1645 (*pos)++, ch);
1646 return 0;
1647}
1648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649static PyObject*
1650get_latin1_char(unsigned char ch)
1651{
Victor Stinnera464fc12011-10-02 20:39:30 +02001652 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001654 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 if (!unicode)
1656 return NULL;
1657 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001658 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 unicode_latin1[ch] = unicode;
1660 }
1661 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001662 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663}
1664
Alexander Belopolsky40018472011-02-26 01:02:56 +00001665PyObject *
1666PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001668 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 Py_UCS4 maxchar = 0;
1670 Py_ssize_t num_surrogates;
1671
1672 if (u == NULL)
1673 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001675 /* If the Unicode data is known at construction time, we can apply
1676 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678 /* Optimization for empty strings */
1679 if (size == 0 && unicode_empty != NULL) {
1680 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001681 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001682 }
Tim Petersced69f82003-09-16 20:30:58 +00001683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 /* Single character Unicode objects in the Latin-1 range are
1685 shared when using this constructor */
1686 if (size == 1 && *u < 256)
1687 return get_latin1_char((unsigned char)*u);
1688
1689 /* If not empty and not single character, copy the Unicode data
1690 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001691 if (find_maxchar_surrogates(u, u + size,
1692 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 return NULL;
1694
Victor Stinner8faf8212011-12-08 22:14:11 +01001695 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 if (!unicode)
1697 return NULL;
1698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 switch (PyUnicode_KIND(unicode)) {
1700 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001701 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001702 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1703 break;
1704 case PyUnicode_2BYTE_KIND:
1705#if Py_UNICODE_SIZE == 2
1706 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1707#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001708 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1710#endif
1711 break;
1712 case PyUnicode_4BYTE_KIND:
1713#if SIZEOF_WCHAR_T == 2
1714 /* This is the only case which has to process surrogates, thus
1715 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001716 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717#else
1718 assert(num_surrogates == 0);
1719 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1720#endif
1721 break;
1722 default:
1723 assert(0 && "Impossible state");
1724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001726 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727}
1728
Alexander Belopolsky40018472011-02-26 01:02:56 +00001729PyObject *
1730PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001731{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001732 if (size < 0) {
1733 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001734 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001735 return NULL;
1736 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001737 if (u != NULL)
1738 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1739 else
1740 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001741}
1742
Alexander Belopolsky40018472011-02-26 01:02:56 +00001743PyObject *
1744PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001745{
1746 size_t size = strlen(u);
1747 if (size > PY_SSIZE_T_MAX) {
1748 PyErr_SetString(PyExc_OverflowError, "input too long");
1749 return NULL;
1750 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001751 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001752}
1753
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001754PyObject *
1755_PyUnicode_FromId(_Py_Identifier *id)
1756{
1757 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001758 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1759 strlen(id->string),
1760 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001761 if (!id->object)
1762 return NULL;
1763 PyUnicode_InternInPlace(&id->object);
1764 assert(!id->next);
1765 id->next = static_strings;
1766 static_strings = id;
1767 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001768 return id->object;
1769}
1770
1771void
1772_PyUnicode_ClearStaticStrings()
1773{
1774 _Py_Identifier *i;
1775 for (i = static_strings; i; i = i->next) {
1776 Py_DECREF(i->object);
1777 i->object = NULL;
1778 i->next = NULL;
1779 }
1780}
1781
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001782/* Internal function, don't check maximum character */
1783
Victor Stinnere57b1c02011-09-28 22:20:48 +02001784static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001785unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001786{
Victor Stinner785938e2011-12-11 20:09:03 +01001787 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001788 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001789#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001790 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001791#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001792 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001793 }
Victor Stinner785938e2011-12-11 20:09:03 +01001794 unicode = PyUnicode_New(size, 127);
1795 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001796 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001797 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1798 assert(_PyUnicode_CheckConsistency(unicode, 1));
1799 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001800}
1801
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001802static Py_UCS4
1803kind_maxchar_limit(unsigned int kind)
1804{
1805 switch(kind) {
1806 case PyUnicode_1BYTE_KIND:
1807 return 0x80;
1808 case PyUnicode_2BYTE_KIND:
1809 return 0x100;
1810 case PyUnicode_4BYTE_KIND:
1811 return 0x10000;
1812 default:
1813 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001814 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001815 }
1816}
1817
Victor Stinner702c7342011-10-05 13:50:52 +02001818static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001819_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001822 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001823
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824 if (size == 0) {
1825 Py_INCREF(unicode_empty);
1826 return unicode_empty;
1827 }
1828 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001829 if (size == 1)
1830 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001831
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001832 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001833 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 if (!res)
1835 return NULL;
1836 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001837 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001839}
1840
Victor Stinnere57b1c02011-09-28 22:20:48 +02001841static PyObject*
1842_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843{
1844 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001845 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001846
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001847 if (size == 0) {
1848 Py_INCREF(unicode_empty);
1849 return unicode_empty;
1850 }
1851 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001852 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001853 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001854
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001855 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001856 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 if (!res)
1858 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001859 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001861 else {
1862 _PyUnicode_CONVERT_BYTES(
1863 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1864 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001865 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 return res;
1867}
1868
Victor Stinnere57b1c02011-09-28 22:20:48 +02001869static PyObject*
1870_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871{
1872 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001873 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001874
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001875 if (size == 0) {
1876 Py_INCREF(unicode_empty);
1877 return unicode_empty;
1878 }
1879 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001880 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001881 return get_latin1_char((unsigned char)u[0]);
1882
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001883 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001884 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 if (!res)
1886 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001887 if (max_char < 256)
1888 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1889 PyUnicode_1BYTE_DATA(res));
1890 else if (max_char < 0x10000)
1891 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1892 PyUnicode_2BYTE_DATA(res));
1893 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001895 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 return res;
1897}
1898
1899PyObject*
1900PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1901{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001902 if (size < 0) {
1903 PyErr_SetString(PyExc_ValueError, "size must be positive");
1904 return NULL;
1905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 switch(kind) {
1907 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001908 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001912 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001913 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001914 PyErr_SetString(PyExc_SystemError, "invalid kind");
1915 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001917}
1918
Victor Stinner25a4b292011-10-06 12:31:55 +02001919/* Ensure that a string uses the most efficient storage, if it is not the
1920 case: create a new string with of the right kind. Write NULL into *p_unicode
1921 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001922static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001923unicode_adjust_maxchar(PyObject **p_unicode)
1924{
1925 PyObject *unicode, *copy;
1926 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001927 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001928 unsigned int kind;
1929
1930 assert(p_unicode != NULL);
1931 unicode = *p_unicode;
1932 assert(PyUnicode_IS_READY(unicode));
1933 if (PyUnicode_IS_ASCII(unicode))
1934 return;
1935
1936 len = PyUnicode_GET_LENGTH(unicode);
1937 kind = PyUnicode_KIND(unicode);
1938 if (kind == PyUnicode_1BYTE_KIND) {
1939 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 max_char = ucs1lib_find_max_char(u, u + len);
1941 if (max_char >= 128)
1942 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001943 }
1944 else if (kind == PyUnicode_2BYTE_KIND) {
1945 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001946 max_char = ucs2lib_find_max_char(u, u + len);
1947 if (max_char >= 256)
1948 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001949 }
1950 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001951 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001952 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001953 max_char = ucs4lib_find_max_char(u, u + len);
1954 if (max_char >= 0x10000)
1955 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001956 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001957 copy = PyUnicode_New(len, max_char);
1958 copy_characters(copy, 0, unicode, 0, len);
1959 Py_DECREF(unicode);
1960 *p_unicode = copy;
1961}
1962
Victor Stinner034f6cf2011-09-30 02:26:44 +02001963PyObject*
1964PyUnicode_Copy(PyObject *unicode)
1965{
Victor Stinner87af4f22011-11-21 23:03:47 +01001966 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001968
Victor Stinner034f6cf2011-09-30 02:26:44 +02001969 if (!PyUnicode_Check(unicode)) {
1970 PyErr_BadInternalCall();
1971 return NULL;
1972 }
1973 if (PyUnicode_READY(unicode))
1974 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001975
Victor Stinner87af4f22011-11-21 23:03:47 +01001976 length = PyUnicode_GET_LENGTH(unicode);
1977 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001978 if (!copy)
1979 return NULL;
1980 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1981
Victor Stinner87af4f22011-11-21 23:03:47 +01001982 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1983 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001985 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001986}
1987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988
Victor Stinnerbc603d12011-10-02 01:00:40 +02001989/* Widen Unicode objects to larger buffers. Don't write terminating null
1990 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991
1992void*
1993_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1994{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001995 Py_ssize_t len;
1996 void *result;
1997 unsigned int skind;
1998
1999 if (PyUnicode_READY(s))
2000 return NULL;
2001
2002 len = PyUnicode_GET_LENGTH(s);
2003 skind = PyUnicode_KIND(s);
2004 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002005 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 return NULL;
2007 }
2008 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_2BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 assert(skind == PyUnicode_1BYTE_KIND);
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS1, Py_UCS2,
2016 PyUnicode_1BYTE_DATA(s),
2017 PyUnicode_1BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 case PyUnicode_4BYTE_KIND:
2021 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2022 if (!result)
2023 return PyErr_NoMemory();
2024 if (skind == PyUnicode_2BYTE_KIND) {
2025 _PyUnicode_CONVERT_BYTES(
2026 Py_UCS2, Py_UCS4,
2027 PyUnicode_2BYTE_DATA(s),
2028 PyUnicode_2BYTE_DATA(s) + len,
2029 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002031 else {
2032 assert(skind == PyUnicode_1BYTE_KIND);
2033 _PyUnicode_CONVERT_BYTES(
2034 Py_UCS1, Py_UCS4,
2035 PyUnicode_1BYTE_DATA(s),
2036 PyUnicode_1BYTE_DATA(s) + len,
2037 result);
2038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002040 default:
2041 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 }
Victor Stinner01698042011-10-04 00:04:26 +02002043 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return NULL;
2045}
2046
2047static Py_UCS4*
2048as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2049 int copy_null)
2050{
2051 int kind;
2052 void *data;
2053 Py_ssize_t len, targetlen;
2054 if (PyUnicode_READY(string) == -1)
2055 return NULL;
2056 kind = PyUnicode_KIND(string);
2057 data = PyUnicode_DATA(string);
2058 len = PyUnicode_GET_LENGTH(string);
2059 targetlen = len;
2060 if (copy_null)
2061 targetlen++;
2062 if (!target) {
2063 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2064 PyErr_NoMemory();
2065 return NULL;
2066 }
2067 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2068 if (!target) {
2069 PyErr_NoMemory();
2070 return NULL;
2071 }
2072 }
2073 else {
2074 if (targetsize < targetlen) {
2075 PyErr_Format(PyExc_SystemError,
2076 "string is longer than the buffer");
2077 if (copy_null && 0 < targetsize)
2078 target[0] = 0;
2079 return NULL;
2080 }
2081 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 if (kind == PyUnicode_1BYTE_KIND) {
2083 Py_UCS1 *start = (Py_UCS1 *) data;
2084 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002086 else if (kind == PyUnicode_2BYTE_KIND) {
2087 Py_UCS2 *start = (Py_UCS2 *) data;
2088 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2089 }
2090 else {
2091 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 if (copy_null)
2095 target[len] = 0;
2096 return target;
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2101 int copy_null)
2102{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002103 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 PyErr_BadInternalCall();
2105 return NULL;
2106 }
2107 return as_ucs4(string, target, targetsize, copy_null);
2108}
2109
2110Py_UCS4*
2111PyUnicode_AsUCS4Copy(PyObject *string)
2112{
2113 return as_ucs4(string, NULL, 0, 1);
2114}
2115
2116#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002117
Alexander Belopolsky40018472011-02-26 01:02:56 +00002118PyObject *
2119PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002122 if (size == 0) {
2123 Py_INCREF(unicode_empty);
2124 return unicode_empty;
2125 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002126 PyErr_BadInternalCall();
2127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 }
2129
Martin v. Löwis790465f2008-04-05 20:41:37 +00002130 if (size == -1) {
2131 size = wcslen(w);
2132 }
2133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135}
2136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002138
Walter Dörwald346737f2007-05-31 10:44:43 +00002139static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002140makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2141 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002143 *fmt++ = '%';
2144 if (width) {
2145 if (zeropad)
2146 *fmt++ = '0';
2147 fmt += sprintf(fmt, "%d", width);
2148 }
2149 if (precision)
2150 fmt += sprintf(fmt, ".%d", precision);
2151 if (longflag)
2152 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002153 else if (longlongflag) {
2154 /* longlongflag should only ever be nonzero on machines with
2155 HAVE_LONG_LONG defined */
2156#ifdef HAVE_LONG_LONG
2157 char *f = PY_FORMAT_LONG_LONG;
2158 while (*f)
2159 *fmt++ = *f++;
2160#else
2161 /* we shouldn't ever get here */
2162 assert(0);
2163 *fmt++ = 'l';
2164#endif
2165 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 else if (size_tflag) {
2167 char *f = PY_FORMAT_SIZE_T;
2168 while (*f)
2169 *fmt++ = *f++;
2170 }
2171 *fmt++ = c;
2172 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002173}
2174
/* Helper for PyUnicode_FromFormatV().

   `f` must point at the '%' that starts a format specification.  The
   function steps over the optional "width" and ".precision" digits and
   over an "l", "ll" (only with HAVE_LONG_LONG) or "z" size modifier when
   that modifier is directly followed by 'd', 'u' or 'i'.

   Each p_* argument may be NULL; when it is not, the parsed value (or 0
   if the item is absent) is stored through it.

   Returns a pointer to the conversion character.  For the bogus inputs
   "%.N%" and a specification cut short by the end of the string, the
   pointer is moved one character back so that the caller's own f++ does
   not skip the '%' or run past the terminating NUL. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;                                /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';

    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Size modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi.
       A modifier that is not followed by d/u/i is left alone, so f keeps
       pointing at it. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* the size_t flag */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2239
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002240/* maximum number of characters required for output of %ld. 21 characters
2241 allows for 64-bit integers (in decimal) and an optional sign. */
2242#define MAX_LONG_CHARS 21
2243/* maximum number of characters required for output of %lld.
2244 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2245 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2246#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2247
Walter Dörwaldd2034312007-05-18 16:29:38 +00002248PyObject *
2249PyUnicode_FromFormatV(const char *format, va_list vargs)
2250{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 va_list count;
2252 Py_ssize_t callcount = 0;
2253 PyObject **callresults = NULL;
2254 PyObject **callresult = NULL;
2255 Py_ssize_t n = 0;
2256 int width = 0;
2257 int precision = 0;
2258 int zeropad;
2259 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002260 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002261 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002262 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2264 Py_UCS4 argmaxchar;
2265 Py_ssize_t numbersize = 0;
2266 char *numberresults = NULL;
2267 char *numberresult = NULL;
2268 Py_ssize_t i;
2269 int kind;
2270 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002271
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002272 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002273 /* step 1: count the number of %S/%R/%A/%s format specifications
2274 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2275 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002277 * also estimate a upper bound for all the number formats in the string,
2278 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 for (f = format; *f; f++) {
2281 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002282 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2284 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2285 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2286 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002289#ifdef HAVE_LONG_LONG
2290 if (longlongflag) {
2291 if (width < MAX_LONG_LONG_CHARS)
2292 width = MAX_LONG_LONG_CHARS;
2293 }
2294 else
2295#endif
2296 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2297 including sign. Decimal takes the most space. This
2298 isn't enough for octal. If a width is specified we
2299 need more (which we allocate later). */
2300 if (width < MAX_LONG_CHARS)
2301 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302
2303 /* account for the size + '\0' to separate numbers
2304 inside of the numberresults buffer */
2305 numbersize += (width + 1);
2306 }
2307 }
2308 else if ((unsigned char)*f > 127) {
2309 PyErr_Format(PyExc_ValueError,
2310 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2311 "string, got a non-ASCII byte: 0x%02x",
2312 (unsigned char)*f);
2313 return NULL;
2314 }
2315 }
2316 /* step 2: allocate memory for the results of
2317 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2318 if (callcount) {
2319 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2320 if (!callresults) {
2321 PyErr_NoMemory();
2322 return NULL;
2323 }
2324 callresult = callresults;
2325 }
2326 /* step 2.5: allocate memory for the results of formating numbers */
2327 if (numbersize) {
2328 numberresults = PyObject_Malloc(numbersize);
2329 if (!numberresults) {
2330 PyErr_NoMemory();
2331 goto fail;
2332 }
2333 numberresult = numberresults;
2334 }
2335
2336 /* step 3: format numbers and figure out how large a buffer we need */
2337 for (f = format; *f; f++) {
2338 if (*f == '%') {
2339 const char* p;
2340 int longflag;
2341 int longlongflag;
2342 int size_tflag;
2343 int numprinted;
2344
2345 p = f;
2346 zeropad = (f[1] == '0');
2347 f = parse_format_flags(f, &width, &precision,
2348 &longflag, &longlongflag, &size_tflag);
2349 switch (*f) {
2350 case 'c':
2351 {
2352 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002353 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 n++;
2355 break;
2356 }
2357 case '%':
2358 n++;
2359 break;
2360 case 'i':
2361 case 'd':
2362 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2363 width, precision, *f);
2364 if (longflag)
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, long));
2367#ifdef HAVE_LONG_LONG
2368 else if (longlongflag)
2369 numprinted = sprintf(numberresult, fmt,
2370 va_arg(count, PY_LONG_LONG));
2371#endif
2372 else if (size_tflag)
2373 numprinted = sprintf(numberresult, fmt,
2374 va_arg(count, Py_ssize_t));
2375 else
2376 numprinted = sprintf(numberresult, fmt,
2377 va_arg(count, int));
2378 n += numprinted;
2379 /* advance by +1 to skip over the '\0' */
2380 numberresult += (numprinted + 1);
2381 assert(*(numberresult - 1) == '\0');
2382 assert(*(numberresult - 2) != '\0');
2383 assert(numprinted >= 0);
2384 assert(numberresult <= numberresults + numbersize);
2385 break;
2386 case 'u':
2387 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2388 width, precision, 'u');
2389 if (longflag)
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned long));
2392#ifdef HAVE_LONG_LONG
2393 else if (longlongflag)
2394 numprinted = sprintf(numberresult, fmt,
2395 va_arg(count, unsigned PY_LONG_LONG));
2396#endif
2397 else if (size_tflag)
2398 numprinted = sprintf(numberresult, fmt,
2399 va_arg(count, size_t));
2400 else
2401 numprinted = sprintf(numberresult, fmt,
2402 va_arg(count, unsigned int));
2403 n += numprinted;
2404 numberresult += (numprinted + 1);
2405 assert(*(numberresult - 1) == '\0');
2406 assert(*(numberresult - 2) != '\0');
2407 assert(numprinted >= 0);
2408 assert(numberresult <= numberresults + numbersize);
2409 break;
2410 case 'x':
2411 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2412 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2413 n += numprinted;
2414 numberresult += (numprinted + 1);
2415 assert(*(numberresult - 1) == '\0');
2416 assert(*(numberresult - 2) != '\0');
2417 assert(numprinted >= 0);
2418 assert(numberresult <= numberresults + numbersize);
2419 break;
2420 case 'p':
2421 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2422 /* %p is ill-defined: ensure leading 0x. */
2423 if (numberresult[1] == 'X')
2424 numberresult[1] = 'x';
2425 else if (numberresult[1] != 'x') {
2426 memmove(numberresult + 2, numberresult,
2427 strlen(numberresult) + 1);
2428 numberresult[0] = '0';
2429 numberresult[1] = 'x';
2430 numprinted += 2;
2431 }
2432 n += numprinted;
2433 numberresult += (numprinted + 1);
2434 assert(*(numberresult - 1) == '\0');
2435 assert(*(numberresult - 2) != '\0');
2436 assert(numprinted >= 0);
2437 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002438 break;
2439 case 's':
2440 {
2441 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002442 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002443 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002444 if (!str)
2445 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 /* since PyUnicode_DecodeUTF8 returns already flexible
2447 unicode objects, there is no need to call ready on them */
2448 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002449 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002451 /* Remember the str and switch to the next slot */
2452 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'U':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002458 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 if (PyUnicode_READY(obj) == -1)
2460 goto fail;
2461 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002462 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002464 break;
2465 }
2466 case 'V':
2467 {
2468 PyObject *obj = va_arg(count, PyObject *);
2469 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002471 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002472 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002473 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 if (PyUnicode_READY(obj) == -1)
2475 goto fail;
2476 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002477 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002479 *callresult++ = NULL;
2480 }
2481 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002482 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002483 if (!str_obj)
2484 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002485 if (PyUnicode_READY(str_obj)) {
2486 Py_DECREF(str_obj);
2487 goto fail;
2488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002490 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002492 *callresult++ = str_obj;
2493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 break;
2495 }
2496 case 'S':
2497 {
2498 PyObject *obj = va_arg(count, PyObject *);
2499 PyObject *str;
2500 assert(obj);
2501 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002503 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002505 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 /* Remember the str and switch to the next slot */
2508 *callresult++ = str;
2509 break;
2510 }
2511 case 'R':
2512 {
2513 PyObject *obj = va_arg(count, PyObject *);
2514 PyObject *repr;
2515 assert(obj);
2516 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002520 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 /* Remember the repr and switch to the next slot */
2523 *callresult++ = repr;
2524 break;
2525 }
2526 case 'A':
2527 {
2528 PyObject *obj = va_arg(count, PyObject *);
2529 PyObject *ascii;
2530 assert(obj);
2531 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002533 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002535 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 /* Remember the repr and switch to the next slot */
2538 *callresult++ = ascii;
2539 break;
2540 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002541 default:
2542 /* if we stumble upon an unknown
2543 formatting code, copy the rest of
2544 the format string to the output
2545 string. (we cannot just skip the
2546 code, since there's no way to know
2547 what's in the argument list) */
2548 n += strlen(p);
2549 goto expand;
2550 }
2551 } else
2552 n++;
2553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 we don't have to resize the string.
2558 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002559 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 if (!string)
2561 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 kind = PyUnicode_KIND(string);
2563 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002564 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002569 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002570
2571 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2573 /* checking for == because the last argument could be a empty
2574 string, which causes i to point to end, the assert at the end of
2575 the loop */
2576 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002577
Benjamin Peterson14339b62009-01-31 16:36:08 +00002578 switch (*f) {
2579 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002580 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 const int ordinal = va_arg(vargs, int);
2582 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002584 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002585 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 case 'p':
2590 /* unused, since we already have the result */
2591 if (*f == 'p')
2592 (void) va_arg(vargs, void *);
2593 else
2594 (void) va_arg(vargs, int);
2595 /* extract the result from numberresults and append. */
2596 for (; *numberresult; ++i, ++numberresult)
2597 PyUnicode_WRITE(kind, data, i, *numberresult);
2598 /* skip over the separating '\0' */
2599 assert(*numberresult == '\0');
2600 numberresult++;
2601 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 break;
2603 case 's':
2604 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002605 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002607 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 size = PyUnicode_GET_LENGTH(*callresult);
2609 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002610 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002612 /* We're done with the unicode()/repr() => forget it */
2613 Py_DECREF(*callresult);
2614 /* switch to next unicode()/repr() result */
2615 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 break;
2617 }
2618 case 'U':
2619 {
2620 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 Py_ssize_t size;
2622 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2623 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002624 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 break;
2627 }
2628 case 'V':
2629 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002632 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 size = PyUnicode_GET_LENGTH(obj);
2635 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002636 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 size = PyUnicode_GET_LENGTH(*callresult);
2640 assert(PyUnicode_KIND(*callresult) <=
2641 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002643 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002644 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002646 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 break;
2648 }
2649 case 'S':
2650 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002651 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002653 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 /* unused, since we already have the result */
2655 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002657 copy_characters(string, i, *callresult, 0, size);
2658 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* We're done with the unicode()/repr() => forget it */
2660 Py_DECREF(*callresult);
2661 /* switch to next unicode()/repr() result */
2662 ++callresult;
2663 break;
2664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 break;
2668 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 for (; *p; ++p, ++i)
2670 PyUnicode_WRITE(kind, data, i, *p);
2671 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 goto end;
2673 }
Victor Stinner1205f272010-09-11 00:54:47 +00002674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 else {
2676 assert(i < PyUnicode_GET_LENGTH(string));
2677 PyUnicode_WRITE(kind, data, i++, *f);
2678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 if (callresults)
2684 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (numberresults)
2686 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002687 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 if (callresults) {
2690 PyObject **callresult2 = callresults;
2691 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002692 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 ++callresult2;
2694 }
2695 PyObject_Free(callresults);
2696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 if (numberresults)
2698 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002700}
2701
Walter Dörwaldd2034312007-05-18 16:29:38 +00002702PyObject *
2703PyUnicode_FromFormat(const char *format, ...)
2704{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 PyObject* ret;
2706 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707
2708#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 ret = PyUnicode_FromFormatV(format, vargs);
2714 va_end(vargs);
2715 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716}
2717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718#ifdef HAVE_WCHAR_H
2719
Victor Stinner5593d8a2010-10-02 11:11:27 +00002720/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2721 convert a Unicode object to a wide character string.
2722
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 character) required to convert the unicode object. Ignore size argument.
2725
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002730unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002731 wchar_t *w,
2732 Py_ssize_t size)
2733{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 const wchar_t *wstr;
2736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 if (wstr == NULL)
2739 return -1;
2740
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (size > res)
2743 size = res + 1;
2744 else
2745 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 return res;
2748 }
2749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002751}
2752
2753Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002754PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002755 wchar_t *w,
2756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
2758 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 PyErr_BadInternalCall();
2760 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002762 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763}
2764
Victor Stinner137c34c2010-09-29 10:25:54 +00002765wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002766PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002767 Py_ssize_t *size)
2768{
2769 wchar_t* buffer;
2770 Py_ssize_t buflen;
2771
2772 if (unicode == NULL) {
2773 PyErr_BadInternalCall();
2774 return NULL;
2775 }
2776
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 if (buflen == -1)
2779 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002781 PyErr_NoMemory();
2782 return NULL;
2783 }
2784
Victor Stinner137c34c2010-09-29 10:25:54 +00002785 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2786 if (buffer == NULL) {
2787 PyErr_NoMemory();
2788 return NULL;
2789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002790 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 if (buflen == -1)
2792 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793 if (size != NULL)
2794 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 return buffer;
2796}
2797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002804 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 PyErr_SetString(PyExc_ValueError,
2806 "chr() arg not in range(0x110000)");
2807 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 if (ordinal < 256)
2811 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 v = PyUnicode_New(1, ordinal);
2814 if (v == NULL)
2815 return NULL;
2816 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002817 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002819}
2820
Alexander Belopolsky40018472011-02-26 01:02:56 +00002821PyObject *
2822PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002824 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002826 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002827 if (PyUnicode_READY(obj))
2828 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 Py_INCREF(obj);
2830 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 }
2832 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 /* For a Unicode subtype that's not a Unicode object,
2834 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002835 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002836 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002837 PyErr_Format(PyExc_TypeError,
2838 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002839 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002841}
2842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843PyObject *
2844PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002845 const char *encoding,
2846 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002847{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002848 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002849 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 PyErr_BadInternalCall();
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002855
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002856 /* Decoding bytes objects is the most common case and should be fast */
2857 if (PyBytes_Check(obj)) {
2858 if (PyBytes_GET_SIZE(obj) == 0) {
2859 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002860 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002861 }
2862 else {
2863 v = PyUnicode_Decode(
2864 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2865 encoding, errors);
2866 }
2867 return v;
2868 }
2869
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002870 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 PyErr_SetString(PyExc_TypeError,
2872 "decoding str is not supported");
2873 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002876 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2877 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2878 PyErr_Format(PyExc_TypeError,
2879 "coercing to str: need bytes, bytearray "
2880 "or buffer-like object, %.80s found",
2881 Py_TYPE(obj)->tp_name);
2882 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002883 }
Tim Petersced69f82003-09-16 20:30:58 +00002884
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002885 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002887 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 }
Tim Petersced69f82003-09-16 20:30:58 +00002889 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002891
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002892 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002893 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894}
2895
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   NUL-terminated on success.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating NUL, does not fit into lower_len bytes).  On error the
   contents of lower are unspecified and not necessarily terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored, and
       &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating NUL as well. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last slot of the buffer: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2932
Alexander Belopolsky40018472011-02-26 01:02:56 +00002933PyObject *
2934PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002935 Py_ssize_t size,
2936 const char *encoding,
2937 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002938{
2939 PyObject *buffer = NULL, *unicode;
2940 Py_buffer info;
2941 char lower[11]; /* Enough for any encoding shortcut */
2942
Fred Drakee4315f52000-05-09 19:53:39 +00002943 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002944 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002945 if ((strcmp(lower, "utf-8") == 0) ||
2946 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002947 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002948 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002949 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002950 (strcmp(lower, "iso-8859-1") == 0))
2951 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002952#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002953 else if (strcmp(lower, "mbcs") == 0)
2954 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002955#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "ascii") == 0)
2957 return PyUnicode_DecodeASCII(s, size, errors);
2958 else if (strcmp(lower, "utf-16") == 0)
2959 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2960 else if (strcmp(lower, "utf-32") == 0)
2961 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963
2964 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002965 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002966 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002967 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002968 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 if (buffer == NULL)
2970 goto onError;
2971 unicode = PyCodec_Decode(buffer, encoding, errors);
2972 if (unicode == NULL)
2973 goto onError;
2974 if (!PyUnicode_Check(unicode)) {
2975 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002976 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002977 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 Py_DECREF(unicode);
2979 goto onError;
2980 }
2981 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002982 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002983
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 Py_XDECREF(buffer);
2986 return NULL;
2987}
2988
Alexander Belopolsky40018472011-02-26 01:02:56 +00002989PyObject *
2990PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002991 const char *encoding,
2992 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002993{
2994 PyObject *v;
2995
2996 if (!PyUnicode_Check(unicode)) {
2997 PyErr_BadArgument();
2998 goto onError;
2999 }
3000
3001 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003003
3004 /* Decode via the codec registry */
3005 v = PyCodec_Decode(unicode, encoding, errors);
3006 if (v == NULL)
3007 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003011 return NULL;
3012}
3013
Alexander Belopolsky40018472011-02-26 01:02:56 +00003014PyObject *
3015PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003016 const char *encoding,
3017 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003018{
3019 PyObject *v;
3020
3021 if (!PyUnicode_Check(unicode)) {
3022 PyErr_BadArgument();
3023 goto onError;
3024 }
3025
3026 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003028
3029 /* Decode via the codec registry */
3030 v = PyCodec_Decode(unicode, encoding, errors);
3031 if (v == NULL)
3032 goto onError;
3033 if (!PyUnicode_Check(v)) {
3034 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003035 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003036 Py_TYPE(v)->tp_name);
3037 Py_DECREF(v);
3038 goto onError;
3039 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003040 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003043 return NULL;
3044}
3045
Alexander Belopolsky40018472011-02-26 01:02:56 +00003046PyObject *
3047PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003048 Py_ssize_t size,
3049 const char *encoding,
3050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051{
3052 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 unicode = PyUnicode_FromUnicode(s, size);
3055 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3058 Py_DECREF(unicode);
3059 return v;
3060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
3063PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 const char *encoding,
3065 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003066{
3067 PyObject *v;
3068
3069 if (!PyUnicode_Check(unicode)) {
3070 PyErr_BadArgument();
3071 goto onError;
3072 }
3073
3074 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
3077 /* Encode via the codec registry */
3078 v = PyCodec_Encode(unicode, encoding, errors);
3079 if (v == NULL)
3080 goto onError;
3081 return v;
3082
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003084 return NULL;
3085}
3086
Victor Stinnerad158722010-10-27 00:25:46 +00003087PyObject *
3088PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003089{
Victor Stinner99b95382011-07-04 14:23:54 +02003090#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003091 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003092#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003093 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003094#else
Victor Stinner793b5312011-04-27 00:24:21 +02003095 PyInterpreterState *interp = PyThreadState_GET()->interp;
3096 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3097 cannot use it to encode and decode filenames before it is loaded. Load
3098 the Python codec requires to encode at least its own filename. Use the C
3099 version of the locale codec until the codec registry is initialized and
3100 the Python codec is loaded.
3101
3102 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3103 cannot only rely on it: check also interp->fscodec_initialized for
3104 subinterpreters. */
3105 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003106 return PyUnicode_AsEncodedString(unicode,
3107 Py_FileSystemDefaultEncoding,
3108 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003109 }
3110 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003111 /* locale encoding with surrogateescape */
3112 wchar_t *wchar;
3113 char *bytes;
3114 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003115 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003116
3117 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3118 if (wchar == NULL)
3119 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003120 bytes = _Py_wchar2char(wchar, &error_pos);
3121 if (bytes == NULL) {
3122 if (error_pos != (size_t)-1) {
3123 char *errmsg = strerror(errno);
3124 PyObject *exc = NULL;
3125 if (errmsg == NULL)
3126 errmsg = "Py_wchar2char() failed";
3127 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003128 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003129 error_pos, error_pos+1,
3130 errmsg);
3131 Py_XDECREF(exc);
3132 }
3133 else
3134 PyErr_NoMemory();
3135 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003136 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003137 }
3138 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003139
3140 bytes_obj = PyBytes_FromString(bytes);
3141 PyMem_Free(bytes);
3142 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003143 }
Victor Stinnerad158722010-10-27 00:25:46 +00003144#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 const char *encoding,
3150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151{
3152 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003153 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003154
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 if (!PyUnicode_Check(unicode)) {
3156 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 }
Fred Drakee4315f52000-05-09 19:53:39 +00003159
Fred Drakee4315f52000-05-09 19:53:39 +00003160 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003161 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003162 if ((strcmp(lower, "utf-8") == 0) ||
3163 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003164 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003165 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003166 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003167 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003169 }
Victor Stinner37296e82010-06-10 13:36:23 +00003170 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003171 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003172 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003174#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003175 else if (strcmp(lower, "mbcs") == 0)
3176 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003177#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003178 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003179 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
3182 /* Encode via the codec registry */
3183 v = PyCodec_Encode(unicode, encoding, errors);
3184 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003185 return NULL;
3186
3187 /* The normal path */
3188 if (PyBytes_Check(v))
3189 return v;
3190
3191 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003193 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003194 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003195
3196 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3197 "encoder %s returned bytearray instead of bytes",
3198 encoding);
3199 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003200 Py_DECREF(v);
3201 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003204 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3205 Py_DECREF(v);
3206 return b;
3207 }
3208
3209 PyErr_Format(PyExc_TypeError,
3210 "encoder did not return a bytes object (type=%.400s)",
3211 Py_TYPE(v)->tp_name);
3212 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003213 return NULL;
3214}
3215
Alexander Belopolsky40018472011-02-26 01:02:56 +00003216PyObject *
3217PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003218 const char *encoding,
3219 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003220{
3221 PyObject *v;
3222
3223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_BadArgument();
3225 goto onError;
3226 }
3227
3228 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003230
3231 /* Encode via the codec registry */
3232 v = PyCodec_Encode(unicode, encoding, errors);
3233 if (v == NULL)
3234 goto onError;
3235 if (!PyUnicode_Check(v)) {
3236 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003237 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003238 Py_TYPE(v)->tp_name);
3239 Py_DECREF(v);
3240 goto onError;
3241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003243
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 return NULL;
3246}
3247
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003249PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003250 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003251 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3252}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003253
Christian Heimes5894ba72007-11-04 11:43:14 +00003254PyObject*
3255PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3256{
Victor Stinner99b95382011-07-04 14:23:54 +02003257#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003258 return PyUnicode_DecodeMBCS(s, size, NULL);
3259#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003260 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003261#else
Victor Stinner793b5312011-04-27 00:24:21 +02003262 PyInterpreterState *interp = PyThreadState_GET()->interp;
3263 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3264 cannot use it to encode and decode filenames before it is loaded. Load
3265 the Python codec requires to encode at least its own filename. Use the C
3266 version of the locale codec until the codec registry is initialized and
3267 the Python codec is loaded.
3268
3269 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3270 cannot only rely on it: check also interp->fscodec_initialized for
3271 subinterpreters. */
3272 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003273 return PyUnicode_Decode(s, size,
3274 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003275 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003276 }
3277 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003278 /* locale encoding with surrogateescape */
3279 wchar_t *wchar;
3280 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003281 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003282
3283 if (s[size] != '\0' || size != strlen(s)) {
3284 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3285 return NULL;
3286 }
3287
Victor Stinner168e1172010-10-16 23:16:16 +00003288 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003289 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003290 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003291
Victor Stinner168e1172010-10-16 23:16:16 +00003292 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003293 PyMem_Free(wchar);
3294 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003295 }
Victor Stinnerad158722010-10-27 00:25:46 +00003296#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003297}
3298
Martin v. Löwis011e8422009-05-05 04:43:17 +00003299
3300int
3301PyUnicode_FSConverter(PyObject* arg, void* addr)
3302{
3303 PyObject *output = NULL;
3304 Py_ssize_t size;
3305 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003306 if (arg == NULL) {
3307 Py_DECREF(*(PyObject**)addr);
3308 return 1;
3309 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003310 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003311 output = arg;
3312 Py_INCREF(output);
3313 }
3314 else {
3315 arg = PyUnicode_FromObject(arg);
3316 if (!arg)
3317 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003318 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003319 Py_DECREF(arg);
3320 if (!output)
3321 return 0;
3322 if (!PyBytes_Check(output)) {
3323 Py_DECREF(output);
3324 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3325 return 0;
3326 }
3327 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003328 size = PyBytes_GET_SIZE(output);
3329 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003330 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003331 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003332 Py_DECREF(output);
3333 return 0;
3334 }
3335 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003336 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003337}
3338
3339
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003340int
3341PyUnicode_FSDecoder(PyObject* arg, void* addr)
3342{
3343 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003344 if (arg == NULL) {
3345 Py_DECREF(*(PyObject**)addr);
3346 return 1;
3347 }
3348 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003349 if (PyUnicode_READY(arg))
3350 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003351 output = arg;
3352 Py_INCREF(output);
3353 }
3354 else {
3355 arg = PyBytes_FromObject(arg);
3356 if (!arg)
3357 return 0;
3358 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3359 PyBytes_GET_SIZE(arg));
3360 Py_DECREF(arg);
3361 if (!output)
3362 return 0;
3363 if (!PyUnicode_Check(output)) {
3364 Py_DECREF(output);
3365 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3366 return 0;
3367 }
3368 }
Victor Stinner065836e2011-10-27 01:56:33 +02003369 if (PyUnicode_READY(output) < 0) {
3370 Py_DECREF(output);
3371 return 0;
3372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003373 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003374 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003375 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3376 Py_DECREF(output);
3377 return 0;
3378 }
3379 *(PyObject**)addr = output;
3380 return Py_CLEANUP_SUPPORTED;
3381}
3382
3383
Martin v. Löwis5b222132007-06-10 09:51:05 +00003384char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003386{
Christian Heimesf3863112007-11-22 07:46:41 +00003387 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003389 if (!PyUnicode_Check(unicode)) {
3390 PyErr_BadArgument();
3391 return NULL;
3392 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003393 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003394 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003396 if (PyUnicode_UTF8(unicode) == NULL) {
3397 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3399 if (bytes == NULL)
3400 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003401 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3402 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 Py_DECREF(bytes);
3404 return NULL;
3405 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003406 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3407 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3408 PyBytes_AS_STRING(bytes),
3409 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 Py_DECREF(bytes);
3411 }
3412
3413 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003414 *psize = PyUnicode_UTF8_LENGTH(unicode);
3415 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003416}
3417
3418char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003419PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3422}
3423
3424#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003425static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426#endif
3427
3428
3429Py_UNICODE *
3430PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003432 const unsigned char *one_byte;
3433#if SIZEOF_WCHAR_T == 4
3434 const Py_UCS2 *two_bytes;
3435#else
3436 const Py_UCS4 *four_bytes;
3437 const Py_UCS4 *ucs4_end;
3438 Py_ssize_t num_surrogates;
3439#endif
3440 wchar_t *w;
3441 wchar_t *wchar_end;
3442
3443 if (!PyUnicode_Check(unicode)) {
3444 PyErr_BadArgument();
3445 return NULL;
3446 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003447 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003449 assert(_PyUnicode_KIND(unicode) != 0);
3450 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451
3452#ifdef Py_DEBUG
3453 ++unicode_as_unicode_calls;
3454#endif
3455
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003458 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3459 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003460 num_surrogates = 0;
3461
3462 for (; four_bytes < ucs4_end; ++four_bytes) {
3463 if (*four_bytes > 0xFFFF)
3464 ++num_surrogates;
3465 }
3466
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003467 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3468 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3469 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 PyErr_NoMemory();
3471 return NULL;
3472 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003473 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003475 w = _PyUnicode_WSTR(unicode);
3476 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3477 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003478 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3479 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003480 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003481 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003482 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3483 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 }
3485 else
3486 *w = *four_bytes;
3487
3488 if (w > wchar_end) {
3489 assert(0 && "Miscalculated string end");
3490 }
3491 }
3492 *w = 0;
3493#else
3494 /* sizeof(wchar_t) == 4 */
3495 Py_FatalError("Impossible unicode object state, wstr and str "
3496 "should share memory already.");
3497 return NULL;
3498#endif
3499 }
3500 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003501 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3502 (_PyUnicode_LENGTH(unicode) + 1));
3503 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003504 PyErr_NoMemory();
3505 return NULL;
3506 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003507 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3508 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3509 w = _PyUnicode_WSTR(unicode);
3510 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003511
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003512 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3513 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003514 for (; w < wchar_end; ++one_byte, ++w)
3515 *w = *one_byte;
3516 /* null-terminate the wstr */
3517 *w = 0;
3518 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003519 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003521 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003522 for (; w < wchar_end; ++two_bytes, ++w)
3523 *w = *two_bytes;
3524 /* null-terminate the wstr */
3525 *w = 0;
3526#else
3527 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003528 PyObject_FREE(_PyUnicode_WSTR(unicode));
3529 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003530 Py_FatalError("Impossible unicode object state, wstr "
3531 "and str should share memory already.");
3532 return NULL;
3533#endif
3534 }
3535 else {
3536 assert(0 && "This should never happen.");
3537 }
3538 }
3539 }
3540 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003541 *size = PyUnicode_WSTR_LENGTH(unicode);
3542 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003543}
3544
Alexander Belopolsky40018472011-02-26 01:02:56 +00003545Py_UNICODE *
3546PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003548 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549}
3550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551
Alexander Belopolsky40018472011-02-26 01:02:56 +00003552Py_ssize_t
3553PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554{
3555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
3557 goto onError;
3558 }
3559 return PyUnicode_GET_SIZE(unicode);
3560
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 return -1;
3563}
3564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565Py_ssize_t
3566PyUnicode_GetLength(PyObject *unicode)
3567{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003568 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003569 PyErr_BadArgument();
3570 return -1;
3571 }
3572
3573 return PyUnicode_GET_LENGTH(unicode);
3574}
3575
3576Py_UCS4
3577PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3578{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003579 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3580 PyErr_BadArgument();
3581 return (Py_UCS4)-1;
3582 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003583 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003584 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 return (Py_UCS4)-1;
3586 }
3587 return PyUnicode_READ_CHAR(unicode, index);
3588}
3589
3590int
3591PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3592{
3593 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003594 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003595 return -1;
3596 }
Victor Stinner488fa492011-12-12 00:01:39 +01003597 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003598 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003599 PyErr_SetString(PyExc_IndexError, "string index out of range");
3600 return -1;
3601 }
Victor Stinner488fa492011-12-12 00:01:39 +01003602 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003603 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3605 index, ch);
3606 return 0;
3607}
3608
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
3614
Victor Stinner554f3f02010-06-16 23:33:54 +00003615/* create or adjust a UnicodeDecodeError */
3616static void
3617make_decode_exception(PyObject **exceptionObject,
3618 const char *encoding,
3619 const char *input, Py_ssize_t length,
3620 Py_ssize_t startpos, Py_ssize_t endpos,
3621 const char *reason)
3622{
3623 if (*exceptionObject == NULL) {
3624 *exceptionObject = PyUnicodeDecodeError_Create(
3625 encoding, input, length, startpos, endpos, reason);
3626 }
3627 else {
3628 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3629 goto onError;
3630 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3631 goto onError;
3632 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3633 goto onError;
3634 }
3635 return;
3636
3637onError:
3638 Py_DECREF(*exceptionObject);
3639 *exceptionObject = NULL;
3640}
3641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642/* error handling callback helper:
3643 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003644 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 and adjust various state variables.
3646 return 0 on success, -1 on error
3647*/
3648
Alexander Belopolsky40018472011-02-26 01:02:56 +00003649static int
3650unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003651 const char *encoding, const char *reason,
3652 const char **input, const char **inend, Py_ssize_t *startinpos,
3653 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003654 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003656 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657
3658 PyObject *restuple = NULL;
3659 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003660 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003662 Py_ssize_t requiredsize;
3663 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 int res = -1;
3666
Victor Stinner596a6c42011-11-09 00:02:18 +01003667 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3668 outsize = PyUnicode_GET_LENGTH(*output);
3669 else
3670 outsize = _PyUnicode_WSTR_LENGTH(*output);
3671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 *errorHandler = PyCodec_LookupError(errors);
3674 if (*errorHandler == NULL)
3675 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677
Victor Stinner554f3f02010-06-16 23:33:54 +00003678 make_decode_exception(exceptionObject,
3679 encoding,
3680 *input, *inend - *input,
3681 *startinpos, *endinpos,
3682 reason);
3683 if (*exceptionObject == NULL)
3684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685
3686 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3687 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003690 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 }
3693 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003695 if (PyUnicode_READY(repunicode) < 0)
3696 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003697
3698 /* Copy back the bytes variables, which might have been modified by the
3699 callback */
3700 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3701 if (!inputobj)
3702 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003703 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003705 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003706 *input = PyBytes_AS_STRING(inputobj);
3707 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003709 /* we can DECREF safely, as the exception has another reference,
3710 so the object won't go away. */
3711 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003715 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3717 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719
Victor Stinner596a6c42011-11-09 00:02:18 +01003720 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3721 /* need more space? (at least enough for what we
3722 have+the replacement+the rest of the string (starting
3723 at the new input position), so we won't have to check space
3724 when there are no errors in the rest of the string) */
3725 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3726 requiredsize = *outpos + replen + insize-newpos;
3727 if (requiredsize > outsize) {
3728 if (requiredsize<2*outsize)
3729 requiredsize = 2*outsize;
3730 if (unicode_resize(output, requiredsize) < 0)
3731 goto onError;
3732 }
3733 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003735 copy_characters(*output, *outpos, repunicode, 0, replen);
3736 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003738 else {
3739 wchar_t *repwstr;
3740 Py_ssize_t repwlen;
3741 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3742 if (repwstr == NULL)
3743 goto onError;
3744 /* need more space? (at least enough for what we
3745 have+the replacement+the rest of the string (starting
3746 at the new input position), so we won't have to check space
3747 when there are no errors in the rest of the string) */
3748 requiredsize = *outpos + repwlen + insize-newpos;
3749 if (requiredsize > outsize) {
3750 if (requiredsize < 2*outsize)
3751 requiredsize = 2*outsize;
3752 if (unicode_resize(output, requiredsize) < 0)
3753 goto onError;
3754 }
3755 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3756 *outpos += repwlen;
3757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003759 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 /* we made it! */
3762 res = 0;
3763
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_XDECREF(restuple);
3766 return res;
3767}
3768
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003769/* --- UTF-7 Codec -------------------------------------------------------- */
3770
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  All of them may evaluate their argument more than once,
   so pass side-effect-free expressions only. */

/* IS_BASE64: true if c is one of A-Z, a-z, 0-9, '+' or '/'. */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of base-64 character c.  Only
   meaningful when IS_BASE64(c) holds; anything that is not a letter,
   digit or '+' is mapped to 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
3803
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 for the table to be consulted; directO and
 * directWS are truth values and are parenthesized in the expansion so
 * that compound expressions can be passed safely.  c is evaluated
 * several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849
Alexander Belopolsky40018472011-02-26 01:02:56 +00003850PyObject *
3851PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003852 Py_ssize_t size,
3853 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003854{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003855 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3856}
3857
Antoine Pitrou244651a2009-05-04 18:56:13 +00003858/* The decoder. The only state we preserve is our read position,
3859 * i.e. how many characters we have consumed. So if we end in the
3860 * middle of a shift sequence we have to back off the read position
3861 * and the output to the beginning of the sequence, otherwise we lose
3862 * all the shift state (seen bits, number of bits seen, high
3863 * surrogate). */
3864
Alexander Belopolsky40018472011-02-26 01:02:56 +00003865PyObject *
3866PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003867 Py_ssize_t size,
3868 const char *errors,
3869 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003872 Py_ssize_t startinpos;
3873 Py_ssize_t endinpos;
3874 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003876 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 const char *errmsg = "";
3878 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003879 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003880 unsigned int base64bits = 0;
3881 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003882 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 PyObject *errorHandler = NULL;
3884 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003885
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003886 /* Start off assuming it's all ASCII. Widen later as necessary. */
3887 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888 if (!unicode)
3889 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003890 if (size == 0) {
3891 if (consumed)
3892 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003893 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003894 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003896 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003897 e = s + size;
3898
3899 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003900 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003902 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003903
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 if (inShift) { /* in a base-64 section */
3905 if (IS_BASE64(ch)) { /* consume a base-64 character */
3906 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3907 base64bits += 6;
3908 s++;
3909 if (base64bits >= 16) {
3910 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003911 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 base64bits -= 16;
3913 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3914 if (surrogate) {
3915 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003916 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3917 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003918 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3919 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003921 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 }
3923 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003924 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3925 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 }
3928 }
Victor Stinner551ac952011-11-29 22:58:13 +01003929 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 /* first surrogate */
3931 surrogate = outCh;
3932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003933 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003934 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3935 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 }
3937 }
3938 }
3939 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003940 inShift = 0;
3941 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003943 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3944 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003945 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003947 if (base64bits > 0) { /* left-over bits */
3948 if (base64bits >= 6) {
3949 /* We've seen at least one base-64 character */
3950 errmsg = "partial character in shift sequence";
3951 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003952 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003953 else {
3954 /* Some bits remain; they should be zero */
3955 if (base64buffer != 0) {
3956 errmsg = "non-zero padding bits in shift sequence";
3957 goto utf7Error;
3958 }
3959 }
3960 }
3961 if (ch != '-') {
3962 /* '-' is absorbed; other terminating
3963 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003964 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3965 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003966 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 }
3968 }
3969 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003971 s++; /* consume '+' */
3972 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003973 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003974 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3975 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 }
3977 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003979 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003981 }
3982 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003983 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003984 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3985 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986 s++;
3987 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003988 else {
3989 startinpos = s-starts;
3990 s++;
3991 errmsg = "unexpected special character";
3992 goto utf7Error;
3993 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003994 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 endinpos = s-starts;
3997 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 errors, &errorHandler,
3999 "utf7", errmsg,
4000 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004001 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004003 }
4004
Antoine Pitrou244651a2009-05-04 18:56:13 +00004005 /* end of string */
4006
4007 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4008 /* if we're in an inconsistent state, that's an error */
4009 if (surrogate ||
4010 (base64bits >= 6) ||
4011 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 endinpos = size;
4013 if (unicode_decode_call_errorhandler(
4014 errors, &errorHandler,
4015 "utf7", "unterminated shift sequence",
4016 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004017 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018 goto onError;
4019 if (s < e)
4020 goto restart;
4021 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004022 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023
4024 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004025 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004026 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004027 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004028 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004029 }
4030 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004031 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004032 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004033 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004035 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036 goto onError;
4037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 Py_XDECREF(errorHandler);
4039 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004040 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004041
Benjamin Peterson29060642009-01-31 22:14:21 +00004042 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 Py_XDECREF(errorHandler);
4044 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004045 Py_DECREF(unicode);
4046 return NULL;
4047}
4048
4049
Alexander Belopolsky40018472011-02-26 01:02:56 +00004050PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004051_PyUnicode_EncodeUTF7(PyObject *str,
4052 int base64SetO,
4053 int base64WhiteSpace,
4054 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004056 int kind;
4057 void *data;
4058 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004059 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004060 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004063 unsigned int base64bits = 0;
4064 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065 char * out;
4066 char * start;
4067
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004068 if (PyUnicode_READY(str) < 0)
4069 return NULL;
4070 kind = PyUnicode_KIND(str);
4071 data = PyUnicode_DATA(str);
4072 len = PyUnicode_GET_LENGTH(str);
4073
4074 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004077 /* It might be possible to tighten this worst case */
4078 allocated = 8 * len;
4079 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004080 return PyErr_NoMemory();
4081
Antoine Pitrou244651a2009-05-04 18:56:13 +00004082 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004083 if (v == NULL)
4084 return NULL;
4085
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004086 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004087 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004088 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004089
Antoine Pitrou244651a2009-05-04 18:56:13 +00004090 if (inShift) {
4091 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4092 /* shifting out */
4093 if (base64bits) { /* output remaining bits */
4094 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4095 base64buffer = 0;
4096 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004097 }
4098 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004099 /* Characters not in the BASE64 set implicitly unshift the sequence
4100 so no '-' is required, except if the character is itself a '-' */
4101 if (IS_BASE64(ch) || ch == '-') {
4102 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004103 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004104 *out++ = (char) ch;
4105 }
4106 else {
4107 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004109 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004110 else { /* not in a shift sequence */
4111 if (ch == '+') {
4112 *out++ = '+';
4113 *out++ = '-';
4114 }
4115 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4116 *out++ = (char) ch;
4117 }
4118 else {
4119 *out++ = '+';
4120 inShift = 1;
4121 goto encode_char;
4122 }
4123 }
4124 continue;
4125encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004126 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004127 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004128
Antoine Pitrou244651a2009-05-04 18:56:13 +00004129 /* code first surrogate */
4130 base64bits += 16;
4131 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4132 while (base64bits >= 6) {
4133 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4134 base64bits -= 6;
4135 }
4136 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004137 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004139 base64bits += 16;
4140 base64buffer = (base64buffer << 16) | ch;
4141 while (base64bits >= 6) {
4142 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4143 base64bits -= 6;
4144 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004145 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004146 if (base64bits)
4147 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4148 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004149 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004150 if (_PyBytes_Resize(&v, out - start) < 0)
4151 return NULL;
4152 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004153}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004154PyObject *
4155PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4156 Py_ssize_t size,
4157 int base64SetO,
4158 int base64WhiteSpace,
4159 const char *errors)
4160{
4161 PyObject *result;
4162 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4163 if (tmp == NULL)
4164 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004165 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004166 base64WhiteSpace, errors);
4167 Py_DECREF(tmp);
4168 return result;
4169}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170
Antoine Pitrou244651a2009-05-04 18:56:13 +00004171#undef IS_BASE64
4172#undef FROM_BASE64
4173#undef TO_BASE64
4174#undef DECODE_DIRECT
4175#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004176
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177/* --- UTF-8 Codec -------------------------------------------------------- */
4178
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4200
Alexander Belopolsky40018472011-02-26 01:02:56 +00004201PyObject *
4202PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004203 Py_ssize_t size,
4204 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205{
Walter Dörwald69652032004-09-07 20:24:22 +00004206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4207}
4208
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004209#include "stringlib/ucs1lib.h"
4210#include "stringlib/codecs.h"
4211#include "stringlib/undef.h"
4212
4213#include "stringlib/ucs2lib.h"
4214#include "stringlib/codecs.h"
4215#include "stringlib/undef.h"
4216
4217#include "stringlib/ucs4lib.h"
4218#include "stringlib/codecs.h"
4219#include "stringlib/undef.h"
4220
Antoine Pitrouab868312009-01-10 15:40:25 +00004221/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4222#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4223
4224/* Mask to quickly check whether a C 'long' contains a
4225 non-ASCII, UTF8-encoded char. */
4226#if (SIZEOF_LONG == 8)
4227# define ASCII_CHAR_MASK 0x8080808080808080L
4228#elif (SIZEOF_LONG == 4)
4229# define ASCII_CHAR_MASK 0x80808080L
4230#else
4231# error C 'long' size should be either 4 or 8!
4232#endif
4233
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004234/* Scans a UTF-8 string and returns the maximum character to be expected
4235 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004237 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 */
4240static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004241utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244 const unsigned char *end = p + string_size;
4245 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004247 assert(unicode_size != NULL);
4248
4249 /* By having a cascade of independent loops which fallback onto each
4250 other, we minimize the amount of work done in the average loop
4251 iteration, and we also maximize the CPU's ability to predict
4252 branches correctly (because a given condition will have always the
4253 same boolean outcome except perhaps in the last iteration of the
4254 corresponding loop).
4255 In the general case this brings us rather close to decoding
4256 performance pre-PEP 393, despite the two-pass decoding.
4257
4258 Note that the pure ASCII loop is not duplicated once a non-ASCII
4259 character has been encountered. It is actually a pessimization (by
4260 a significant factor) to use this loop on text with many non-ASCII
4261 characters, and it is important to avoid bad performance on valid
4262 utf-8 data (invalid utf-8 being a different can of worms).
4263 */
4264
4265 /* ASCII */
4266 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 /* Only check value if it's not a ASCII char... */
4268 if (*p < 0x80) {
4269 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4270 an explanation. */
4271 if (!((size_t) p & LONG_PTR_MASK)) {
4272 /* Help register allocation */
4273 register const unsigned char *_p = p;
4274 while (_p < aligned_end) {
4275 unsigned long value = *(unsigned long *) _p;
4276 if (value & ASCII_CHAR_MASK)
4277 break;
4278 _p += SIZEOF_LONG;
4279 char_count += SIZEOF_LONG;
4280 }
4281 p = _p;
4282 if (p == end)
4283 break;
4284 }
4285 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004286 if (*p < 0x80)
4287 ++char_count;
4288 else
4289 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004291 *unicode_size = char_count;
4292 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004294_ucs1loop:
4295 for (; p < end; ++p) {
4296 if (*p < 0xc4)
4297 char_count += ((*p & 0xc0) != 0x80);
4298 else
4299 goto _ucs2loop;
4300 }
4301 *unicode_size = char_count;
4302 return 255;
4303
4304_ucs2loop:
4305 for (; p < end; ++p) {
4306 if (*p < 0xf0)
4307 char_count += ((*p & 0xc0) != 0x80);
4308 else
4309 goto _ucs4loop;
4310 }
4311 *unicode_size = char_count;
4312 return 65535;
4313
4314_ucs4loop:
4315 for (; p < end; ++p) {
4316 char_count += ((*p & 0xc0) != 0x80);
4317 }
4318 *unicode_size = char_count;
4319 return 65537;
4320}
4321
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004322/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004323 in case of errors. Implicit parameters: unicode, kind, data, onError.
4324 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004325*/
Victor Stinner785938e2011-12-11 20:09:03 +01004326#define WRITE_MAYBE_FAIL(index, value) \
4327 do { \
4328 Py_ssize_t pos = index; \
4329 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4330 unicode_resize(&unicode, pos + pos/8) < 0) \
4331 goto onError; \
4332 if (unicode_putchar(&unicode, &pos, value) < 0) \
4333 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 } while (0)
4335
Alexander Belopolsky40018472011-02-26 01:02:56 +00004336PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004337decode_utf8_errors(const char *starts,
4338 Py_ssize_t size,
4339 const char *errors,
4340 Py_ssize_t *consumed,
4341 const char *s,
4342 PyObject *unicode,
4343 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004344{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004346 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004347 Py_ssize_t startinpos;
4348 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004349 const char *e = starts + size;
4350 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004351 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 PyObject *errorHandler = NULL;
4353 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004354
Antoine Pitrouab868312009-01-10 15:40:25 +00004355 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356
4357 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004358 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359
4360 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004361 /* Fast path for runs of ASCII characters. Given that common UTF-8
4362 input will consist of an overwhelming majority of ASCII
4363 characters, we try to optimize for this case by checking
4364 as many characters as a C 'long' can contain.
4365 First, check if we can do an aligned read, as most CPUs have
4366 a penalty for unaligned reads.
4367 */
4368 if (!((size_t) s & LONG_PTR_MASK)) {
4369 /* Help register allocation */
4370 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004371 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004372 while (_s < aligned_end) {
4373 /* Read a whole long at a time (either 4 or 8 bytes),
4374 and do a fast unrolled copy if it only contains ASCII
4375 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004376 unsigned long value = *(unsigned long *) _s;
4377 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004378 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004379 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4380 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4381 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4382 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004383#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4385 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4386 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4387 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004388#endif
4389 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004390 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004391 }
4392 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004393 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004394 if (s == e)
4395 break;
4396 ch = (unsigned char)*s;
4397 }
4398 }
4399
4400 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004401 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 s++;
4403 continue;
4404 }
4405
4406 n = utf8_code_length[ch];
4407
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004408 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 if (consumed)
4410 break;
4411 else {
4412 errmsg = "unexpected end of data";
4413 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004414 endinpos = startinpos+1;
4415 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4416 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 goto utf8Error;
4418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420
4421 switch (n) {
4422
4423 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004424 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 startinpos = s-starts;
4426 endinpos = startinpos+1;
4427 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428
4429 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004430 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 startinpos = s-starts;
4432 endinpos = startinpos+1;
4433 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434
4435 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004436 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004437 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004439 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto utf8Error;
4441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004444 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445 break;
4446
4447 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004448 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4449 will result in surrogates in range d800-dfff. Surrogates are
4450 not valid UTF-8 so they are rejected.
4451 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4452 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004453 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004454 (s[2] & 0xc0) != 0x80 ||
4455 ((unsigned char)s[0] == 0xE0 &&
4456 (unsigned char)s[1] < 0xA0) ||
4457 ((unsigned char)s[0] == 0xED &&
4458 (unsigned char)s[1] > 0x9F)) {
4459 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004461 endinpos = startinpos + 1;
4462
4463 /* if s[1] first two bits are 1 and 0, then the invalid
4464 continuation byte is s[2], so increment endinpos by 1,
4465 if not, s[1] is invalid and endinpos doesn't need to
4466 be incremented. */
4467 if ((s[1] & 0xC0) == 0x80)
4468 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 goto utf8Error;
4470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004472 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004474 break;
4475
4476 case 4:
4477 if ((s[1] & 0xc0) != 0x80 ||
4478 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004479 (s[3] & 0xc0) != 0x80 ||
4480 ((unsigned char)s[0] == 0xF0 &&
4481 (unsigned char)s[1] < 0x90) ||
4482 ((unsigned char)s[0] == 0xF4 &&
4483 (unsigned char)s[1] > 0x8F)) {
4484 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004486 endinpos = startinpos + 1;
4487 if ((s[1] & 0xC0) == 0x80) {
4488 endinpos++;
4489 if ((s[2] & 0xC0) == 0x80)
4490 endinpos++;
4491 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 goto utf8Error;
4493 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004494 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004495 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004496 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004497
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004498 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 }
4501 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004503
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 if (unicode_decode_call_errorhandler(
4506 errors, &errorHandler,
4507 "utf8", errmsg,
4508 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004509 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004511 /* Update data because unicode_decode_call_errorhandler might have
4512 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 }
Walter Dörwald69652032004-09-07 20:24:22 +00004515 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004518 /* Adjust length and ready string when it contained errors and
4519 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004520 if (unicode_resize(&unicode, i) < 0)
4521 goto onError;
4522 unicode_adjust_maxchar(&unicode);
4523 if (unicode == NULL)
4524 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 Py_XDECREF(errorHandler);
4527 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004528 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004529 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532 Py_XDECREF(errorHandler);
4533 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004534 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 return NULL;
4536}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004537#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004538
Victor Stinner785938e2011-12-11 20:09:03 +01004539PyObject *
4540PyUnicode_DecodeUTF8Stateful(const char *s,
4541 Py_ssize_t size,
4542 const char *errors,
4543 Py_ssize_t *consumed)
4544{
4545 Py_UCS4 maxchar = 0;
4546 Py_ssize_t unicode_size;
4547 int has_errors = 0;
4548 PyObject *unicode;
4549 int kind;
4550 void *data;
4551 const char *starts = s;
4552 const char *e;
4553 Py_ssize_t i;
4554
4555 if (size == 0) {
4556 if (consumed)
4557 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004558 Py_INCREF(unicode_empty);
4559 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004560 }
4561
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004562 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004563
4564 /* When the string is ASCII only, just use memcpy and return.
4565 unicode_size may be != size if there is an incomplete UTF-8
4566 sequence at the end of the ASCII block. */
4567 if (maxchar < 128 && size == unicode_size) {
4568 if (consumed)
4569 *consumed = size;
4570 return unicode_fromascii(s, size);
4571 }
4572
4573 unicode = PyUnicode_New(unicode_size, maxchar);
4574 if (!unicode)
4575 return NULL;
4576 kind = PyUnicode_KIND(unicode);
4577 data = PyUnicode_DATA(unicode);
4578
4579 /* Unpack UTF-8 encoded data */
4580 i = 0;
4581 e = starts + size;
4582 switch (kind) {
4583 case PyUnicode_1BYTE_KIND:
4584 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4585 break;
4586 case PyUnicode_2BYTE_KIND:
4587 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4588 break;
4589 case PyUnicode_4BYTE_KIND:
4590 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4591 break;
4592 }
4593 if (!has_errors) {
4594 /* Ensure the unicode size calculation was correct */
4595 assert(i == unicode_size);
4596 assert(s == e);
4597 if (consumed)
4598 *consumed = size;
4599 return unicode;
4600 }
4601
4602 /* In case of errors, maxchar and size computation might be incorrect;
4603 code below refits and resizes as necessary. */
4604 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4605}
4606
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004607#ifdef __APPLE__
4608
4609/* Simplified UTF-8 decoder using surrogateescape error handler,
4610 used to decode the command line arguments on Mac OS X. */
4611
4612wchar_t*
4613_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4614{
4615 int n;
4616 const char *e;
4617 wchar_t *unicode, *p;
4618
4619 /* Note: size will always be longer than the resulting Unicode
4620 character count */
4621 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4622 PyErr_NoMemory();
4623 return NULL;
4624 }
4625 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4626 if (!unicode)
4627 return NULL;
4628
4629 /* Unpack UTF-8 encoded data */
4630 p = unicode;
4631 e = s + size;
4632 while (s < e) {
4633 Py_UCS4 ch = (unsigned char)*s;
4634
4635 if (ch < 0x80) {
4636 *p++ = (wchar_t)ch;
4637 s++;
4638 continue;
4639 }
4640
4641 n = utf8_code_length[ch];
4642 if (s + n > e) {
4643 goto surrogateescape;
4644 }
4645
4646 switch (n) {
4647 case 0:
4648 case 1:
4649 goto surrogateescape;
4650
4651 case 2:
4652 if ((s[1] & 0xc0) != 0x80)
4653 goto surrogateescape;
4654 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4655 assert ((ch > 0x007F) && (ch <= 0x07FF));
4656 *p++ = (wchar_t)ch;
4657 break;
4658
4659 case 3:
4660 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4661 will result in surrogates in range d800-dfff. Surrogates are
4662 not valid UTF-8 so they are rejected.
4663 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4664 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4665 if ((s[1] & 0xc0) != 0x80 ||
4666 (s[2] & 0xc0) != 0x80 ||
4667 ((unsigned char)s[0] == 0xE0 &&
4668 (unsigned char)s[1] < 0xA0) ||
4669 ((unsigned char)s[0] == 0xED &&
4670 (unsigned char)s[1] > 0x9F)) {
4671
4672 goto surrogateescape;
4673 }
4674 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4675 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004677 break;
4678
4679 case 4:
4680 if ((s[1] & 0xc0) != 0x80 ||
4681 (s[2] & 0xc0) != 0x80 ||
4682 (s[3] & 0xc0) != 0x80 ||
4683 ((unsigned char)s[0] == 0xF0 &&
4684 (unsigned char)s[1] < 0x90) ||
4685 ((unsigned char)s[0] == 0xF4 &&
4686 (unsigned char)s[1] > 0x8F)) {
4687 goto surrogateescape;
4688 }
4689 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4690 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004691 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004692
4693#if SIZEOF_WCHAR_T == 4
4694 *p++ = (wchar_t)ch;
4695#else
4696 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004697 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4698 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004699#endif
4700 break;
4701 }
4702 s += n;
4703 continue;
4704
4705 surrogateescape:
4706 *p++ = 0xDC00 + ch;
4707 s++;
4708 }
4709 *p = L'\0';
4710 return unicode;
4711}
4712
4713#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004715/* Primary internal function which creates utf8 encoded bytes objects.
4716
4717 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004718 and allocate exactly as much space needed at the end. Else allocate the
4719 maximum possible needed (4 result bytes per Unicode character), and return
4720 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004721*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004722PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004723_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724{
Tim Peters602f7402002-04-27 18:03:26 +00004725#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004726
Guido van Rossum98297ee2007-11-06 21:34:58 +00004727 Py_ssize_t i; /* index into s of next input byte */
4728 PyObject *result; /* result string object */
4729 char *p; /* next free byte in output buffer */
4730 Py_ssize_t nallocated; /* number of result bytes allocated */
4731 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004732 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004733 PyObject *errorHandler = NULL;
4734 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004735 int kind;
4736 void *data;
4737 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004738 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740 if (!PyUnicode_Check(unicode)) {
4741 PyErr_BadArgument();
4742 return NULL;
4743 }
4744
4745 if (PyUnicode_READY(unicode) == -1)
4746 return NULL;
4747
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004748 if (PyUnicode_UTF8(unicode))
4749 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4750 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751
4752 kind = PyUnicode_KIND(unicode);
4753 data = PyUnicode_DATA(unicode);
4754 size = PyUnicode_GET_LENGTH(unicode);
4755
Tim Peters602f7402002-04-27 18:03:26 +00004756 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757
Tim Peters602f7402002-04-27 18:03:26 +00004758 if (size <= MAX_SHORT_UNICHARS) {
4759 /* Write into the stack buffer; nallocated can't overflow.
4760 * At the end, we'll allocate exactly as much heap space as it
4761 * turns out we need.
4762 */
4763 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004764 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004765 p = stackbuf;
4766 }
4767 else {
4768 /* Overallocate on the heap, and give the excess back at the end. */
4769 nallocated = size * 4;
4770 if (nallocated / 4 != size) /* overflow! */
4771 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004772 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004773 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004774 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004775 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004776 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004777
Tim Peters602f7402002-04-27 18:03:26 +00004778 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004780
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004781 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004782 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004784
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004786 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004787 *p++ = (char)(0xc0 | (ch >> 6));
4788 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004789 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 Py_ssize_t repsize, k, startpos;
4792 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004793 rep = unicode_encode_call_errorhandler(
4794 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004795 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796 if (!rep)
4797 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799 if (PyBytes_Check(rep))
4800 repsize = PyBytes_GET_SIZE(rep);
4801 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004802 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004803
4804 if (repsize > 4) {
4805 Py_ssize_t offset;
4806
4807 if (result == NULL)
4808 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004809 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004812 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4813 /* integer overflow */
4814 PyErr_NoMemory();
4815 goto error;
4816 }
4817 nallocated += repsize - 4;
4818 if (result != NULL) {
4819 if (_PyBytes_Resize(&result, nallocated) < 0)
4820 goto error;
4821 } else {
4822 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004823 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004824 goto error;
4825 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4826 }
4827 p = PyBytes_AS_STRING(result) + offset;
4828 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830 if (PyBytes_Check(rep)) {
4831 char *prep = PyBytes_AS_STRING(rep);
4832 for(k = repsize; k > 0; k--)
4833 *p++ = *prep++;
4834 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004835 enum PyUnicode_Kind repkind;
4836 void *repdata;
4837
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004838 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004839 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004840 repkind = PyUnicode_KIND(rep);
4841 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842
4843 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004844 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004846 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004847 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004848 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004850 goto error;
4851 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004852 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004853 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004854 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004855 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004856 } else if (ch < 0x10000) {
4857 *p++ = (char)(0xe0 | (ch >> 12));
4858 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4859 *p++ = (char)(0x80 | (ch & 0x3f));
4860 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004861 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004862 /* Encode UCS4 Unicode ordinals */
4863 *p++ = (char)(0xf0 | (ch >> 18));
4864 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4865 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4866 *p++ = (char)(0x80 | (ch & 0x3f));
4867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004869
Guido van Rossum98297ee2007-11-06 21:34:58 +00004870 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004871 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004872 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004873 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004874 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004875 }
4876 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004877 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004878 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004879 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004880 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004883 Py_XDECREF(errorHandler);
4884 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004885 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004886 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004887 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004888 Py_XDECREF(errorHandler);
4889 Py_XDECREF(exc);
4890 Py_XDECREF(result);
4891 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004892
Tim Peters602f7402002-04-27 18:03:26 +00004893#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894}
4895
Alexander Belopolsky40018472011-02-26 01:02:56 +00004896PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004897PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4898 Py_ssize_t size,
4899 const char *errors)
4900{
4901 PyObject *v, *unicode;
4902
4903 unicode = PyUnicode_FromUnicode(s, size);
4904 if (unicode == NULL)
4905 return NULL;
4906 v = _PyUnicode_AsUTF8String(unicode, errors);
4907 Py_DECREF(unicode);
4908 return v;
4909}
4910
4911PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004914 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915}
4916
Walter Dörwald41980ca2007-08-16 21:55:45 +00004917/* --- UTF-32 Codec ------------------------------------------------------- */
4918
4919PyObject *
4920PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 Py_ssize_t size,
4922 const char *errors,
4923 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924{
4925 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4926}
4927
4928PyObject *
4929PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 Py_ssize_t size,
4931 const char *errors,
4932 int *byteorder,
4933 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934{
4935 const char *starts = s;
4936 Py_ssize_t startinpos;
4937 Py_ssize_t endinpos;
4938 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004939 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004940 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 int bo = 0; /* assume native ordering by default */
4942 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943 /* Offsets from q for retrieving bytes in the right order. */
4944#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4945 int iorder[] = {0, 1, 2, 3};
4946#else
4947 int iorder[] = {3, 2, 1, 0};
4948#endif
4949 PyObject *errorHandler = NULL;
4950 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004951
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 q = (unsigned char *)s;
4953 e = q + size;
4954
4955 if (byteorder)
4956 bo = *byteorder;
4957
4958 /* Check for BOM marks (U+FEFF) in the input and adjust current
4959 byte order setting accordingly. In native mode, the leading BOM
4960 mark is skipped, in all other modes, it is copied to the output
4961 stream as-is (giving a ZWNBSP character). */
4962 if (bo == 0) {
4963 if (size >= 4) {
4964 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 if (bom == 0x0000FEFF) {
4968 q += 4;
4969 bo = -1;
4970 }
4971 else if (bom == 0xFFFE0000) {
4972 q += 4;
4973 bo = 1;
4974 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 if (bom == 0x0000FEFF) {
4977 q += 4;
4978 bo = 1;
4979 }
4980 else if (bom == 0xFFFE0000) {
4981 q += 4;
4982 bo = -1;
4983 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004986 }
4987
4988 if (bo == -1) {
4989 /* force LE */
4990 iorder[0] = 0;
4991 iorder[1] = 1;
4992 iorder[2] = 2;
4993 iorder[3] = 3;
4994 }
4995 else if (bo == 1) {
4996 /* force BE */
4997 iorder[0] = 3;
4998 iorder[1] = 2;
4999 iorder[2] = 1;
5000 iorder[3] = 0;
5001 }
5002
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005003 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005004 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005005 if (!unicode)
5006 return NULL;
5007 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005008 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005009 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005010
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 Py_UCS4 ch;
5013 /* remaining bytes at the end? (size should be divisible by 4) */
5014 if (e-q<4) {
5015 if (consumed)
5016 break;
5017 errmsg = "truncated data";
5018 startinpos = ((const char *)q)-starts;
5019 endinpos = ((const char *)e)-starts;
5020 goto utf32Error;
5021 /* The remaining input chars are ignored if the callback
5022 chooses to skip the input */
5023 }
5024 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5025 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 if (ch >= 0x110000)
5028 {
5029 errmsg = "codepoint not in range(0x110000)";
5030 startinpos = ((const char *)q)-starts;
5031 endinpos = startinpos+4;
5032 goto utf32Error;
5033 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005034 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5035 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 q += 4;
5037 continue;
5038 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 if (unicode_decode_call_errorhandler(
5040 errors, &errorHandler,
5041 "utf32", errmsg,
5042 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005043 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045 }
5046
5047 if (byteorder)
5048 *byteorder = bo;
5049
5050 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052
5053 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005054 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055 goto onError;
5056
5057 Py_XDECREF(errorHandler);
5058 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005059 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062 Py_DECREF(unicode);
5063 Py_XDECREF(errorHandler);
5064 Py_XDECREF(exc);
5065 return NULL;
5066}
5067
5068PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005069_PyUnicode_EncodeUTF32(PyObject *str,
5070 const char *errors,
5071 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005073 int kind;
5074 void *data;
5075 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005076 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005078 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079 /* Offsets from p for storing byte pairs in the right order. */
5080#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5081 int iorder[] = {0, 1, 2, 3};
5082#else
5083 int iorder[] = {3, 2, 1, 0};
5084#endif
5085
Benjamin Peterson29060642009-01-31 22:14:21 +00005086#define STORECHAR(CH) \
5087 do { \
5088 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5089 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5090 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5091 p[iorder[0]] = (CH) & 0xff; \
5092 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 } while(0)
5094
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005095 if (!PyUnicode_Check(str)) {
5096 PyErr_BadArgument();
5097 return NULL;
5098 }
5099 if (PyUnicode_READY(str) < 0)
5100 return NULL;
5101 kind = PyUnicode_KIND(str);
5102 data = PyUnicode_DATA(str);
5103 len = PyUnicode_GET_LENGTH(str);
5104
5105 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005106 bytesize = nsize * 4;
5107 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 if (v == NULL)
5111 return NULL;
5112
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005113 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005116 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005117 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118
5119 if (byteorder == -1) {
5120 /* force LE */
5121 iorder[0] = 0;
5122 iorder[1] = 1;
5123 iorder[2] = 2;
5124 iorder[3] = 3;
5125 }
5126 else if (byteorder == 1) {
5127 /* force BE */
5128 iorder[0] = 3;
5129 iorder[1] = 2;
5130 iorder[2] = 1;
5131 iorder[3] = 0;
5132 }
5133
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 for (i = 0; i < len; i++)
5135 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005136
5137 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005138 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139#undef STORECHAR
5140}
5141
Alexander Belopolsky40018472011-02-26 01:02:56 +00005142PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005143PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5144 Py_ssize_t size,
5145 const char *errors,
5146 int byteorder)
5147{
5148 PyObject *result;
5149 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5150 if (tmp == NULL)
5151 return NULL;
5152 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5153 Py_DECREF(tmp);
5154 return result;
5155}
5156
5157PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005158PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159{
Victor Stinnerb960b342011-11-20 19:12:52 +01005160 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005161}
5162
/* --- UTF-16 Codec ------------------------------------------------------- */
5164
Tim Peters772747b2001-08-09 22:21:55 +00005165PyObject *
5166PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_ssize_t size,
5168 const char *errors,
5169 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Walter Dörwald69652032004-09-07 20:24:22 +00005171 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5172}
5173
Antoine Pitrouab868312009-01-10 15:40:25 +00005174/* Two masks for fast checking of whether a C 'long' may contain
5175 UTF16-encoded surrogate characters. This is an efficient heuristic,
5176 assuming that non-surrogate characters with a code point >= 0x8000 are
5177 rare in most input.
5178 FAST_CHAR_MASK is used when the input is in native byte ordering,
5179 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005180*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005181#if (SIZEOF_LONG == 8)
5182# define FAST_CHAR_MASK 0x8000800080008000L
5183# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5184#elif (SIZEOF_LONG == 4)
5185# define FAST_CHAR_MASK 0x80008000L
5186# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5187#else
5188# error C 'long' size should be either 4 or 8!
5189#endif
5190
Walter Dörwald69652032004-09-07 20:24:22 +00005191PyObject *
5192PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 Py_ssize_t size,
5194 const char *errors,
5195 int *byteorder,
5196 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005199 Py_ssize_t startinpos;
5200 Py_ssize_t endinpos;
5201 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005202 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005203 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005204 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005205 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005206 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005207 /* Offsets from q for retrieving byte pairs in the right order. */
5208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5209 int ihi = 1, ilo = 0;
5210#else
5211 int ihi = 0, ilo = 1;
5212#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 PyObject *errorHandler = NULL;
5214 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216 /* Note: size will always be longer than the resulting Unicode
5217 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005218 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 if (!unicode)
5220 return NULL;
5221 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005222 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005223 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224
Tim Peters772747b2001-08-09 22:21:55 +00005225 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005226 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005229 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005231 /* Check for BOM marks (U+FEFF) in the input and adjust current
5232 byte order setting accordingly. In native mode, the leading BOM
5233 mark is skipped, in all other modes, it is copied to the output
5234 stream as-is (giving a ZWNBSP character). */
5235 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005236 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005237 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005238#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (bom == 0xFEFF) {
5240 q += 2;
5241 bo = -1;
5242 }
5243 else if (bom == 0xFFFE) {
5244 q += 2;
5245 bo = 1;
5246 }
Tim Petersced69f82003-09-16 20:30:58 +00005247#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 if (bom == 0xFEFF) {
5249 q += 2;
5250 bo = 1;
5251 }
5252 else if (bom == 0xFFFE) {
5253 q += 2;
5254 bo = -1;
5255 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259
Tim Peters772747b2001-08-09 22:21:55 +00005260 if (bo == -1) {
5261 /* force LE */
5262 ihi = 1;
5263 ilo = 0;
5264 }
5265 else if (bo == 1) {
5266 /* force BE */
5267 ihi = 0;
5268 ilo = 1;
5269 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005270#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5271 native_ordering = ilo < ihi;
5272#else
5273 native_ordering = ilo > ihi;
5274#endif
Tim Peters772747b2001-08-09 22:21:55 +00005275
Antoine Pitrouab868312009-01-10 15:40:25 +00005276 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005277 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005278 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005279 /* First check for possible aligned read of a C 'long'. Unaligned
5280 reads are more expensive, better to defer to another iteration. */
5281 if (!((size_t) q & LONG_PTR_MASK)) {
5282 /* Fast path for runs of non-surrogate chars. */
5283 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005284 int kind = PyUnicode_KIND(unicode);
5285 void *data = PyUnicode_DATA(unicode);
5286 while (_q < aligned_end) {
5287 unsigned long block = * (unsigned long *) _q;
5288 unsigned short *pblock = (unsigned short*)&block;
5289 Py_UCS4 maxch;
5290 if (native_ordering) {
5291 /* Can use buffer directly */
5292 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005293 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005294 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005295 else {
5296 /* Need to byte-swap */
5297 unsigned char *_p = (unsigned char*)pblock;
5298 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005299 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005300 _p[0] = _q[1];
5301 _p[1] = _q[0];
5302 _p[2] = _q[3];
5303 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005304#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005305 _p[4] = _q[5];
5306 _p[5] = _q[4];
5307 _p[6] = _q[7];
5308 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005309#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005310 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005311 maxch = Py_MAX(pblock[0], pblock[1]);
5312#if SIZEOF_LONG == 8
5313 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5314#endif
5315 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5316 if (unicode_widen(&unicode, maxch) < 0)
5317 goto onError;
5318 kind = PyUnicode_KIND(unicode);
5319 data = PyUnicode_DATA(unicode);
5320 }
5321 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5322 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5323#if SIZEOF_LONG == 8
5324 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5325 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5326#endif
5327 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005328 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005329 q = _q;
5330 if (q >= e)
5331 break;
5332 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005334
Benjamin Peterson14339b62009-01-31 16:36:08 +00005335 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005336
Victor Stinner551ac952011-11-29 22:58:13 +01005337 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005338 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5339 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 continue;
5341 }
5342
5343 /* UTF-16 code pair: */
5344 if (q > e) {
5345 errmsg = "unexpected end of data";
5346 startinpos = (((const char *)q) - 2) - starts;
5347 endinpos = ((const char *)e) + 1 - starts;
5348 goto utf16Error;
5349 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005350 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5351 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005353 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005354 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005355 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005356 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 continue;
5358 }
5359 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005360 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 startinpos = (((const char *)q)-4)-starts;
5362 endinpos = startinpos+2;
5363 goto utf16Error;
5364 }
5365
Benjamin Peterson14339b62009-01-31 16:36:08 +00005366 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 errmsg = "illegal encoding";
5368 startinpos = (((const char *)q)-2)-starts;
5369 endinpos = startinpos+2;
5370 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005371
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005374 errors,
5375 &errorHandler,
5376 "utf16", errmsg,
5377 &starts,
5378 (const char **)&e,
5379 &startinpos,
5380 &endinpos,
5381 &exc,
5382 (const char **)&q,
5383 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005384 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005387 /* remaining byte at the end? (size should be even) */
5388 if (e == q) {
5389 if (!consumed) {
5390 errmsg = "truncated data";
5391 startinpos = ((const char *)q) - starts;
5392 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005393 if (unicode_decode_call_errorhandler(
5394 errors,
5395 &errorHandler,
5396 "utf16", errmsg,
5397 &starts,
5398 (const char **)&e,
5399 &startinpos,
5400 &endinpos,
5401 &exc,
5402 (const char **)&q,
5403 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005404 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005405 goto onError;
5406 /* The remaining input chars are ignored if the callback
5407 chooses to skip the input */
5408 }
5409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410
5411 if (byteorder)
5412 *byteorder = bo;
5413
Walter Dörwald69652032004-09-07 20:24:22 +00005414 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005416
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005418 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 goto onError;
5420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005421 Py_XDECREF(errorHandler);
5422 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005423 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 Py_XDECREF(errorHandler);
5428 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 return NULL;
5430}
5431
Antoine Pitrouab868312009-01-10 15:40:25 +00005432#undef FAST_CHAR_MASK
5433#undef SWAPPED_FAST_CHAR_MASK
5434
Tim Peters772747b2001-08-09 22:21:55 +00005435PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005436_PyUnicode_EncodeUTF16(PyObject *str,
5437 const char *errors,
5438 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005440 int kind;
5441 void *data;
5442 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005443 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005444 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005445 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005446 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005447 /* Offsets from p for storing byte pairs in the right order. */
5448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5449 int ihi = 1, ilo = 0;
5450#else
5451 int ihi = 0, ilo = 1;
5452#endif
5453
Benjamin Peterson29060642009-01-31 22:14:21 +00005454#define STORECHAR(CH) \
5455 do { \
5456 p[ihi] = ((CH) >> 8) & 0xff; \
5457 p[ilo] = (CH) & 0xff; \
5458 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005459 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005461 if (!PyUnicode_Check(str)) {
5462 PyErr_BadArgument();
5463 return NULL;
5464 }
5465 if (PyUnicode_READY(str) < 0)
5466 return NULL;
5467 kind = PyUnicode_KIND(str);
5468 data = PyUnicode_DATA(str);
5469 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005470
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005471 pairs = 0;
5472 if (kind == PyUnicode_4BYTE_KIND)
5473 for (i = 0; i < len; i++)
5474 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5475 pairs++;
5476 /* 2 * (len + pairs + (byteorder == 0)) */
5477 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005480 bytesize = nsize * 2;
5481 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005483 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 if (v == NULL)
5485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005487 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005490 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005491 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005492
5493 if (byteorder == -1) {
5494 /* force LE */
5495 ihi = 1;
5496 ilo = 0;
5497 }
5498 else if (byteorder == 1) {
5499 /* force BE */
5500 ihi = 0;
5501 ilo = 1;
5502 }
5503
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005504 for (i = 0; i < len; i++) {
5505 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5506 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005508 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5509 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 }
Tim Peters772747b2001-08-09 22:21:55 +00005511 STORECHAR(ch);
5512 if (ch2)
5513 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005514 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005515
5516 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005517 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005518#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519}
5520
Alexander Belopolsky40018472011-02-26 01:02:56 +00005521PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005522PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5523 Py_ssize_t size,
5524 const char *errors,
5525 int byteorder)
5526{
5527 PyObject *result;
5528 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5529 if (tmp == NULL)
5530 return NULL;
5531 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5532 Py_DECREF(tmp);
5533 return result;
5534}
5535
5536PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005537PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005539 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540}
5541
/* --- Unicode Escape Codec ----------------------------------------------- */
5543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5545 if all the escapes in the string make it still a valid ASCII string.
5546 Returns -1 if any escapes were found which cause the string to
5547 pop out of ASCII range. Otherwise returns the length of the
5548 required buffer to hold the string.
5549 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005550static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5552{
5553 const unsigned char *p = (const unsigned char *)s;
5554 const unsigned char *end = p + size;
5555 Py_ssize_t length = 0;
5556
5557 if (size < 0)
5558 return -1;
5559
5560 for (; p < end; ++p) {
5561 if (*p > 127) {
5562 /* Non-ASCII */
5563 return -1;
5564 }
5565 else if (*p != '\\') {
5566 /* Normal character */
5567 ++length;
5568 }
5569 else {
5570 /* Backslash-escape, check next char */
5571 ++p;
5572 /* Escape sequence reaches till end of string or
5573 non-ASCII follow-up. */
5574 if (p >= end || *p > 127)
5575 return -1;
5576 switch (*p) {
5577 case '\n':
5578 /* backslash + \n result in zero characters */
5579 break;
5580 case '\\': case '\'': case '\"':
5581 case 'b': case 'f': case 't':
5582 case 'n': case 'r': case 'v': case 'a':
5583 ++length;
5584 break;
5585 case '0': case '1': case '2': case '3':
5586 case '4': case '5': case '6': case '7':
5587 case 'x': case 'u': case 'U': case 'N':
5588 /* these do not guarantee ASCII characters */
5589 return -1;
5590 default:
5591 /* count the backslash + the other character */
5592 length += 2;
5593 }
5594 }
5595 }
5596 return length;
5597}
5598
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the \N{...} escape
   handling in PyUnicode_DecodeUnicodeEscape -- confirm against the code
   that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005600
Alexander Belopolsky40018472011-02-26 01:02:56 +00005601PyObject *
5602PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005603 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005604 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005607 Py_ssize_t startinpos;
5608 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005610 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005612 char* message;
5613 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614 PyObject *errorHandler = NULL;
5615 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005616 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005618
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005619 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005620
5621 /* After length_of_escaped_ascii_string() there are two alternatives,
5622 either the string is pure ASCII with named escapes like \n, etc.
5623 and we determined it's exact size (common case)
5624 or it contains \x, \u, ... escape sequences. then we create a
5625 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005626 if (len >= 0) {
5627 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 if (!v)
5629 goto onError;
5630 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 }
5632 else {
5633 /* Escaped strings will always be longer than the resulting
5634 Unicode string, so we start with size here and then reduce the
5635 length after conversion to the true value.
5636 (but if the error callback returns a long replacement string
5637 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005638 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 if (!v)
5640 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005641 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 }
5643
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005645 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005648
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 while (s < end) {
5650 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005651 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005654 /* The only case in which i == ascii_length is a backslash
5655 followed by a newline. */
5656 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 /* Non-escape characters are interpreted as Unicode ordinals */
5659 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005660 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5661 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 continue;
5663 }
5664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 /* \ - Escapes */
5667 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005668 c = *s++;
5669 if (s > end)
5670 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005671
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005672 /* The only case in which i == ascii_length is a backslash
5673 followed by a newline. */
5674 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005676 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005679#define WRITECHAR(ch) \
5680 do { \
5681 if (unicode_putchar(&v, &i, ch) < 0) \
5682 goto onError; \
5683 }while(0)
5684
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005686 case '\\': WRITECHAR('\\'); break;
5687 case '\'': WRITECHAR('\''); break;
5688 case '\"': WRITECHAR('\"'); break;
5689 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005690 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 case 'f': WRITECHAR('\014'); break;
5692 case 't': WRITECHAR('\t'); break;
5693 case 'n': WRITECHAR('\n'); break;
5694 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005696 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 case '0': case '1': case '2': case '3':
5702 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005703 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005704 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005705 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005706 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005707 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005709 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 break;
5711
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 /* hex escapes */
5713 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005715 digits = 2;
5716 message = "truncated \\xXX escape";
5717 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005721 digits = 4;
5722 message = "truncated \\uXXXX escape";
5723 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005726 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005727 digits = 8;
5728 message = "truncated \\UXXXXXXXX escape";
5729 hexescape:
5730 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 if (s+digits>end) {
5732 endinpos = size;
5733 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 errors, &errorHandler,
5735 "unicodeescape", "end of string in escape sequence",
5736 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005737 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 goto onError;
5739 goto nextByte;
5740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005741 for (j = 0; j < digits; ++j) {
5742 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005743 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005745 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 errors, &errorHandler,
5747 "unicodeescape", message,
5748 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005749 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005750 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005751 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005753 }
5754 chr = (chr<<4) & ~0xF;
5755 if (c >= '0' && c <= '9')
5756 chr += c - '0';
5757 else if (c >= 'a' && c <= 'f')
5758 chr += 10 + c - 'a';
5759 else
5760 chr += 10 + c - 'A';
5761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005762 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005763 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 /* _decoding_error will have already written into the
5765 target buffer. */
5766 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005767 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005768 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005769 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005770 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005771 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 errors, &errorHandler,
5775 "unicodeescape", "illegal Unicode character",
5776 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005778 goto onError;
5779 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005780 break;
5781
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 case 'N':
5784 message = "malformed \\N character escape";
5785 if (ucnhash_CAPI == NULL) {
5786 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005787 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5788 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 if (ucnhash_CAPI == NULL)
5790 goto ucnhashError;
5791 }
5792 if (*s == '{') {
5793 const char *start = s+1;
5794 /* look for the closing brace */
5795 while (*s != '}' && s < end)
5796 s++;
5797 if (s > start && s < end && *s == '}') {
5798 /* found a name. look it up in the unicode database */
5799 message = "unknown Unicode character name";
5800 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005802 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803 goto store;
5804 }
5805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 errors, &errorHandler,
5809 "unicodeescape", message,
5810 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005811 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 break;
5814
5815 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005816 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005817 message = "\\ at end of string";
5818 s--;
5819 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 errors, &errorHandler,
5822 "unicodeescape", message,
5823 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005824 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005825 goto onError;
5826 }
5827 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005828 WRITECHAR('\\');
5829 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005830 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005831 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005836#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005838 if (PyUnicode_Resize(&v, i) < 0)
5839 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005840 Py_XDECREF(errorHandler);
5841 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005842 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005843
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005845 PyErr_SetString(
5846 PyExc_UnicodeError,
5847 "\\N escapes not supported (can't load unicodedata module)"
5848 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005849 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 Py_XDECREF(errorHandler);
5851 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005852 return NULL;
5853
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 Py_XDECREF(errorHandler);
5857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 return NULL;
5859}
5860
5861/* Return a Unicode-Escape string version of the Unicode object.
5862
5863 If quotes is true, the string is enclosed in u"" or u'' quotes as
5864 appropriate.
5865
5866*/
5867
Alexander Belopolsky40018472011-02-26 01:02:56 +00005868PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005869PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 int kind;
5875 void *data;
5876 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Thomas Wouters89f507f2006-12-13 04:49:30 +00005878 /* Initial allocation is based on the longest-possible unichr
5879 escape.
5880
5881 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5882 unichr, so in this case it's the longest unichr escape. In
5883 narrow (UTF-16) builds this is five chars per source unichr
5884 since there are two unichrs in the surrogate pair, so in narrow
5885 (UTF-16) builds it's not the longest unichr escape.
5886
5887 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5888 so in the narrow (UTF-16) build case it's the longest unichr
5889 escape.
5890 */
5891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 if (!PyUnicode_Check(unicode)) {
5893 PyErr_BadArgument();
5894 return NULL;
5895 }
5896 if (PyUnicode_READY(unicode) < 0)
5897 return NULL;
5898 len = PyUnicode_GET_LENGTH(unicode);
5899 kind = PyUnicode_KIND(unicode);
5900 data = PyUnicode_DATA(unicode);
5901 switch(kind) {
5902 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5903 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5904 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5905 }
5906
5907 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005908 return PyBytes_FromStringAndSize(NULL, 0);
5909
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005912
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005913 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 if (repr == NULL)
5918 return NULL;
5919
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005920 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005922 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005923 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005924
Walter Dörwald79e913e2007-05-12 11:08:06 +00005925 /* Escape backslashes */
5926 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 *p++ = '\\';
5928 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005929 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005930 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005931
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005932 /* Map 21-bit characters to '\U00xxxxxx' */
5933 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005934 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005935 *p++ = '\\';
5936 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005937 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5938 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5940 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5941 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5942 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5943 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5944 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005946 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005949 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 *p++ = '\\';
5951 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005952 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5953 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5954 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5955 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005958 /* Map special whitespace to '\t', \n', '\r' */
5959 else if (ch == '\t') {
5960 *p++ = '\\';
5961 *p++ = 't';
5962 }
5963 else if (ch == '\n') {
5964 *p++ = '\\';
5965 *p++ = 'n';
5966 }
5967 else if (ch == '\r') {
5968 *p++ = '\\';
5969 *p++ = 'r';
5970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005971
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005972 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005973 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005975 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5977 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005978 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005979
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 /* Copy everything else as-is */
5981 else
5982 *p++ = (char) ch;
5983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005985 assert(p - PyBytes_AS_STRING(repr) > 0);
5986 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5987 return NULL;
5988 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989}
5990
Alexander Belopolsky40018472011-02-26 01:02:56 +00005991PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5993 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 PyObject *result;
5996 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5997 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999 result = PyUnicode_AsUnicodeEscapeString(tmp);
6000 Py_DECREF(tmp);
6001 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002}
6003
6004/* --- Raw Unicode Escape Codec ------------------------------------------- */
6005
Alexander Belopolsky40018472011-02-26 01:02:56 +00006006PyObject *
6007PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006008 Py_ssize_t size,
6009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012 Py_ssize_t startinpos;
6013 Py_ssize_t endinpos;
6014 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006015 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 const char *end;
6017 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 PyObject *errorHandler = NULL;
6019 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006020
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 /* Escaped strings will always be longer than the resulting
6022 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 length after conversion to the true value. (But decoding error
6024 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006025 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006029 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006030 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 end = s + size;
6032 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 unsigned char c;
6034 Py_UCS4 x;
6035 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006036 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* Non-escape characters are interpreted as Unicode ordinals */
6039 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6041 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006043 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 startinpos = s-starts;
6045
6046 /* \u-escapes are only interpreted iff the number of leading
6047 backslashes if odd */
6048 bs = s;
6049 for (;s < end;) {
6050 if (*s != '\\')
6051 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006052 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6053 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 }
6055 if (((s - bs) & 1) == 0 ||
6056 s >= end ||
6057 (*s != 'u' && *s != 'U')) {
6058 continue;
6059 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006060 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 count = *s=='u' ? 4 : 8;
6062 s++;
6063
6064 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 for (x = 0, i = 0; i < count; ++i, ++s) {
6066 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006067 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 endinpos = s-starts;
6069 if (unicode_decode_call_errorhandler(
6070 errors, &errorHandler,
6071 "rawunicodeescape", "truncated \\uXXXX",
6072 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006073 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 goto onError;
6075 goto nextByte;
6076 }
6077 x = (x<<4) & ~0xF;
6078 if (c >= '0' && c <= '9')
6079 x += c - '0';
6080 else if (c >= 'a' && c <= 'f')
6081 x += 10 + c - 'a';
6082 else
6083 x += 10 + c - 'A';
6084 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006085 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006086 if (unicode_putchar(&v, &outpos, x) < 0)
6087 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006088 } else {
6089 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006090 if (unicode_decode_call_errorhandler(
6091 errors, &errorHandler,
6092 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006094 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006096 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 nextByte:
6098 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006100 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 Py_XDECREF(errorHandler);
6103 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006104 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006105
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 Py_XDECREF(errorHandler);
6109 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 return NULL;
6111}
6112
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006113
Alexander Belopolsky40018472011-02-26 01:02:56 +00006114PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006115PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006117 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 char *p;
6119 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006120 Py_ssize_t expandsize, pos;
6121 int kind;
6122 void *data;
6123 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125 if (!PyUnicode_Check(unicode)) {
6126 PyErr_BadArgument();
6127 return NULL;
6128 }
6129 if (PyUnicode_READY(unicode) < 0)
6130 return NULL;
6131 kind = PyUnicode_KIND(unicode);
6132 data = PyUnicode_DATA(unicode);
6133 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006134 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6135 bytes, and 1 byte characters 4. */
6136 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006137
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006140
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 if (repr == NULL)
6143 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006145 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006147 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 for (pos = 0; pos < len; pos++) {
6149 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 /* Map 32-bit characters to '\Uxxxxxxxx' */
6151 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006152 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006153 *p++ = '\\';
6154 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006155 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6156 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6160 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6161 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6162 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006163 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 *p++ = '\\';
6167 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006168 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6169 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6170 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6171 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 /* Copy everything else as-is */
6174 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 *p++ = (char) ch;
6176 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006177
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 assert(p > q);
6179 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006180 return NULL;
6181 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182}
6183
Alexander Belopolsky40018472011-02-26 01:02:56 +00006184PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6186 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 PyObject *result;
6189 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6190 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006191 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6193 Py_DECREF(tmp);
6194 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195}
6196
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006197/* --- Unicode Internal Codec ------------------------------------------- */
6198
Alexander Belopolsky40018472011-02-26 01:02:56 +00006199PyObject *
6200_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006201 Py_ssize_t size,
6202 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203{
6204 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006205 Py_ssize_t startinpos;
6206 Py_ssize_t endinpos;
6207 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006208 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006209 const char *end;
6210 const char *reason;
6211 PyObject *errorHandler = NULL;
6212 PyObject *exc = NULL;
6213
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006214 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006215 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006216 1))
6217 return NULL;
6218
Thomas Wouters89f507f2006-12-13 04:49:30 +00006219 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006220 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006221 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006223 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006224 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006225 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006226 end = s + size;
6227
6228 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006229 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006230 Py_UCS4 ch;
6231 /* We copy the raw representation one byte at a time because the
6232 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006233 ((char *) &uch)[0] = s[0];
6234 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006235#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006236 ((char *) &uch)[2] = s[2];
6237 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006238#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006239 ch = uch;
6240
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006241 /* We have to sanity check the raw data, otherwise doom looms for
6242 some malformed UCS-4 data. */
6243 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006244#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006245 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006246#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 end-s < Py_UNICODE_SIZE
6248 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006250 startinpos = s - starts;
6251 if (end-s < Py_UNICODE_SIZE) {
6252 endinpos = end-starts;
6253 reason = "truncated input";
6254 }
6255 else {
6256 endinpos = s - starts + Py_UNICODE_SIZE;
6257 reason = "illegal code point (> 0x10FFFF)";
6258 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006259 if (unicode_decode_call_errorhandler(
6260 errors, &errorHandler,
6261 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006262 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006263 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006264 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006265 continue;
6266 }
6267
6268 s += Py_UNICODE_SIZE;
6269#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006270 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006271 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006272 Py_UNICODE uch2;
6273 ((char *) &uch2)[0] = s[0];
6274 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006275 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006276 {
Victor Stinner551ac952011-11-29 22:58:13 +01006277 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006278 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006279 }
6280 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006281#endif
6282
6283 if (unicode_putchar(&v, &outpos, ch) < 0)
6284 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006285 }
6286
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006287 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288 goto onError;
6289 Py_XDECREF(errorHandler);
6290 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006291 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006294 Py_XDECREF(v);
6295 Py_XDECREF(errorHandler);
6296 Py_XDECREF(exc);
6297 return NULL;
6298}
6299
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300/* --- Latin-1 Codec ------------------------------------------------------ */
6301
Alexander Belopolsky40018472011-02-26 01:02:56 +00006302PyObject *
6303PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006304 Py_ssize_t size,
6305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006308 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309}
6310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006312static void
6313make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006314 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006315 PyObject *unicode,
6316 Py_ssize_t startpos, Py_ssize_t endpos,
6317 const char *reason)
6318{
6319 if (*exceptionObject == NULL) {
6320 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006321 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006322 encoding, unicode, startpos, endpos, reason);
6323 }
6324 else {
6325 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6326 goto onError;
6327 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6328 goto onError;
6329 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6330 goto onError;
6331 return;
6332 onError:
6333 Py_DECREF(*exceptionObject);
6334 *exceptionObject = NULL;
6335 }
6336}
6337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339static void
6340raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006341 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006342 PyObject *unicode,
6343 Py_ssize_t startpos, Py_ssize_t endpos,
6344 const char *reason)
6345{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006346 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006347 encoding, unicode, startpos, endpos, reason);
6348 if (*exceptionObject != NULL)
6349 PyCodec_StrictErrors(*exceptionObject);
6350}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351
6352/* error handling callback helper:
6353 build arguments, call the callback and check the arguments,
6354 put the result into newpos and return the replacement string, which
6355 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006356static PyObject *
6357unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006358 PyObject **errorHandler,
6359 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006360 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006361 Py_ssize_t startpos, Py_ssize_t endpos,
6362 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006364 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006365 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366 PyObject *restuple;
6367 PyObject *resunicode;
6368
6369 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373 }
6374
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006375 if (PyUnicode_READY(unicode) < 0)
6376 return NULL;
6377 len = PyUnicode_GET_LENGTH(unicode);
6378
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006379 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006380 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383
6384 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006389 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 Py_DECREF(restuple);
6391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006393 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 &resunicode, newpos)) {
6395 Py_DECREF(restuple);
6396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006398 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6399 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6400 Py_DECREF(restuple);
6401 return NULL;
6402 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006404 *newpos = len + *newpos;
6405 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6407 Py_DECREF(restuple);
6408 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410 Py_INCREF(resunicode);
6411 Py_DECREF(restuple);
6412 return resunicode;
6413}
6414
Alexander Belopolsky40018472011-02-26 01:02:56 +00006415static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006417 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006418 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 /* input state */
6421 Py_ssize_t pos=0, size;
6422 int kind;
6423 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 /* output object */
6425 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 /* pointer into the output */
6427 char *str;
6428 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006429 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006430 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6431 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 PyObject *errorHandler = NULL;
6433 PyObject *exc = NULL;
6434 /* the following variable is used for caching string comparisons
6435 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6436 int known_errorHandler = -1;
6437
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006438 if (PyUnicode_READY(unicode) < 0)
6439 return NULL;
6440 size = PyUnicode_GET_LENGTH(unicode);
6441 kind = PyUnicode_KIND(unicode);
6442 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 /* allocate enough for a simple encoding without
6444 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006445 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006446 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006447 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006449 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006450 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 ressize = size;
6452
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 while (pos < size) {
6454 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 /* can we encode this? */
6457 if (c<limit) {
6458 /* no overflow check, because we know that the space is enough */
6459 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006460 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006461 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 Py_ssize_t requiredsize;
6464 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 Py_ssize_t collstart = pos;
6468 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006470 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 ++collend;
6472 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6473 if (known_errorHandler==-1) {
6474 if ((errors==NULL) || (!strcmp(errors, "strict")))
6475 known_errorHandler = 1;
6476 else if (!strcmp(errors, "replace"))
6477 known_errorHandler = 2;
6478 else if (!strcmp(errors, "ignore"))
6479 known_errorHandler = 3;
6480 else if (!strcmp(errors, "xmlcharrefreplace"))
6481 known_errorHandler = 4;
6482 else
6483 known_errorHandler = 0;
6484 }
6485 switch (known_errorHandler) {
6486 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006487 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 goto onError;
6489 case 2: /* replace */
6490 while (collstart++<collend)
6491 *str++ = '?'; /* fall through */
6492 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 break;
6495 case 4: /* xmlcharrefreplace */
6496 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 /* determine replacement size */
6498 for (i = collstart, repsize = 0; i < collend; ++i) {
6499 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6500 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006504 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006512 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006513 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006515 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006517 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 if (requiredsize > ressize) {
6519 if (requiredsize<2*ressize)
6520 requiredsize = 2*ressize;
6521 if (_PyBytes_Resize(&res, requiredsize))
6522 goto onError;
6523 str = PyBytes_AS_STRING(res) + respos;
6524 ressize = requiredsize;
6525 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 /* generate replacement */
6527 for (i = collstart; i < collend; ++i) {
6528 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 break;
6532 default:
6533 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 encoding, reason, unicode, &exc,
6535 collstart, collend, &newpos);
6536 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6537 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006539 if (PyBytes_Check(repunicode)) {
6540 /* Directly copy bytes result to output. */
6541 repsize = PyBytes_Size(repunicode);
6542 if (repsize > 1) {
6543 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006544 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006545 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6546 Py_DECREF(repunicode);
6547 goto onError;
6548 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006549 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006550 ressize += repsize-1;
6551 }
6552 memcpy(str, PyBytes_AsString(repunicode), repsize);
6553 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006555 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006556 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 /* need more space? (at least enough for what we
6559 have+the replacement+the rest of the string, so
6560 we won't have to check space for encodable characters) */
6561 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006562 repsize = PyUnicode_GET_LENGTH(repunicode);
6563 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 if (requiredsize > ressize) {
6565 if (requiredsize<2*ressize)
6566 requiredsize = 2*ressize;
6567 if (_PyBytes_Resize(&res, requiredsize)) {
6568 Py_DECREF(repunicode);
6569 goto onError;
6570 }
6571 str = PyBytes_AS_STRING(res) + respos;
6572 ressize = requiredsize;
6573 }
6574 /* check if there is anything unencodable in the replacement
6575 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 for (i = 0; repsize-->0; ++i, ++str) {
6577 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006579 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 Py_DECREF(repunicode);
6582 goto onError;
6583 }
6584 *str = (char)c;
6585 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006586 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006587 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 }
6590 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006591 /* Resize if we allocated to much */
6592 size = str - PyBytes_AS_STRING(res);
6593 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006594 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006595 if (_PyBytes_Resize(&res, size) < 0)
6596 goto onError;
6597 }
6598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 Py_XDECREF(errorHandler);
6600 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006601 return res;
6602
6603 onError:
6604 Py_XDECREF(res);
6605 Py_XDECREF(errorHandler);
6606 Py_XDECREF(exc);
6607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608}
6609
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611PyObject *
6612PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 Py_ssize_t size,
6614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 PyObject *result;
6617 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6618 if (unicode == NULL)
6619 return NULL;
6620 result = unicode_encode_ucs1(unicode, errors, 256);
6621 Py_DECREF(unicode);
6622 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006626_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
6628 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 PyErr_BadArgument();
6630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006632 if (PyUnicode_READY(unicode) == -1)
6633 return NULL;
6634 /* Fast path: if it is a one-byte string, construct
6635 bytes object directly. */
6636 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6637 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6638 PyUnicode_GET_LENGTH(unicode));
6639 /* Non-Latin-1 characters present. Defer to above function to
6640 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006642}
6643
6644PyObject*
6645PyUnicode_AsLatin1String(PyObject *unicode)
6646{
6647 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
6650/* --- 7-bit ASCII Codec -------------------------------------------------- */
6651
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652PyObject *
6653PyUnicode_DecodeASCII(const char *s,
6654 Py_ssize_t size,
6655 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006658 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006659 int kind;
6660 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661 Py_ssize_t startinpos;
6662 Py_ssize_t endinpos;
6663 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006665 int has_error;
6666 const unsigned char *p = (const unsigned char *)s;
6667 const unsigned char *end = p + size;
6668 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 PyObject *errorHandler = NULL;
6670 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006671
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006672 if (size == 0) {
6673 Py_INCREF(unicode_empty);
6674 return unicode_empty;
6675 }
6676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006678 if (size == 1 && (unsigned char)s[0] < 128)
6679 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006680
Victor Stinner702c7342011-10-05 13:50:52 +02006681 has_error = 0;
6682 while (p < end && !has_error) {
6683 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6684 an explanation. */
6685 if (!((size_t) p & LONG_PTR_MASK)) {
6686 /* Help register allocation */
6687 register const unsigned char *_p = p;
6688 while (_p < aligned_end) {
6689 unsigned long value = *(unsigned long *) _p;
6690 if (value & ASCII_CHAR_MASK) {
6691 has_error = 1;
6692 break;
6693 }
6694 _p += SIZEOF_LONG;
6695 }
6696 if (_p == end)
6697 break;
6698 if (has_error)
6699 break;
6700 p = _p;
6701 }
6702 if (*p & 0x80) {
6703 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006704 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006705 }
6706 else {
6707 ++p;
6708 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006709 }
Victor Stinner702c7342011-10-05 13:50:52 +02006710 if (!has_error)
6711 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006712
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006713 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006717 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006718 kind = PyUnicode_KIND(v);
6719 data = PyUnicode_DATA(v);
6720 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 e = s + size;
6722 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 register unsigned char c = (unsigned char)*s;
6724 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006725 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 ++s;
6727 }
6728 else {
6729 startinpos = s-starts;
6730 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 if (unicode_decode_call_errorhandler(
6732 errors, &errorHandler,
6733 "ascii", "ordinal not in range(128)",
6734 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006735 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006737 kind = PyUnicode_KIND(v);
6738 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006741 if (PyUnicode_Resize(&v, outpos) < 0)
6742 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006743 Py_XDECREF(errorHandler);
6744 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006745 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006746 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006747
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 Py_XDECREF(errorHandler);
6751 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 return NULL;
6753}
6754
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006756PyObject *
6757PyUnicode_EncodeASCII(const Py_UNICODE *p,
6758 Py_ssize_t size,
6759 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 PyObject *result;
6762 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6763 if (unicode == NULL)
6764 return NULL;
6765 result = unicode_encode_ucs1(unicode, errors, 128);
6766 Py_DECREF(unicode);
6767 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
Alexander Belopolsky40018472011-02-26 01:02:56 +00006770PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006771_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
6773 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 PyErr_BadArgument();
6775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006777 if (PyUnicode_READY(unicode) == -1)
6778 return NULL;
6779 /* Fast path: if it is an ASCII-only string, construct bytes object
6780 directly. Else defer to above function to raise the exception. */
6781 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6782 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6783 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785}
6786
6787PyObject *
6788PyUnicode_AsASCIIString(PyObject *unicode)
6789{
6790 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791}
6792
Victor Stinner99b95382011-07-04 14:23:54 +02006793#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006795/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006796
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006797#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798#define NEED_RETRY
6799#endif
6800
Victor Stinner3a50e702011-10-18 21:21:00 +02006801#ifndef WC_ERR_INVALID_CHARS
6802# define WC_ERR_INVALID_CHARS 0x0080
6803#endif
6804
6805static char*
6806code_page_name(UINT code_page, PyObject **obj)
6807{
6808 *obj = NULL;
6809 if (code_page == CP_ACP)
6810 return "mbcs";
6811 if (code_page == CP_UTF7)
6812 return "CP_UTF7";
6813 if (code_page == CP_UTF8)
6814 return "CP_UTF8";
6815
6816 *obj = PyBytes_FromFormat("cp%u", code_page);
6817 if (*obj == NULL)
6818 return NULL;
6819 return PyBytes_AS_STRING(*obj);
6820}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821
Alexander Belopolsky40018472011-02-26 01:02:56 +00006822static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006823is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824{
6825 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006826 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 if (!IsDBCSLeadByteEx(code_page, *curr))
6829 return 0;
6830
6831 prev = CharPrevExA(code_page, s, curr, 0);
6832 if (prev == curr)
6833 return 1;
6834 /* FIXME: This code is limited to "true" double-byte encodings,
6835 as it assumes an incomplete character consists of a single
6836 byte. */
6837 if (curr - prev == 2)
6838 return 1;
6839 if (!IsDBCSLeadByteEx(code_page, *prev))
6840 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841 return 0;
6842}
6843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844static DWORD
6845decode_code_page_flags(UINT code_page)
6846{
6847 if (code_page == CP_UTF7) {
6848 /* The CP_UTF7 decoder only supports flags=0 */
6849 return 0;
6850 }
6851 else
6852 return MB_ERR_INVALID_CHARS;
6853}
6854
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006856 * Decode a byte string from a Windows code page into unicode object in strict
6857 * mode.
6858 *
6859 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6860 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006862static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006863decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006864 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006865 const char *in,
6866 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867{
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006869 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006870 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
6872 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006873 assert(insize > 0);
6874 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6875 if (outsize <= 0)
6876 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877
6878 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006880 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 if (*v == NULL)
6882 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884 }
6885 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006888 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891 }
6892
6893 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6895 if (outsize <= 0)
6896 goto error;
6897 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006898
Victor Stinner3a50e702011-10-18 21:21:00 +02006899error:
6900 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6901 return -2;
6902 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006903 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904}
6905
Victor Stinner3a50e702011-10-18 21:21:00 +02006906/*
6907 * Decode a byte string from a code page into unicode object with an error
6908 * handler.
6909 *
6910 * Returns consumed size if succeed, or raise a WindowsError or
6911 * UnicodeDecodeError exception and returns -1 on error.
6912 */
6913static int
6914decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006915 PyObject **v,
6916 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 const char *errors)
6918{
6919 const char *startin = in;
6920 const char *endin = in + size;
6921 const DWORD flags = decode_code_page_flags(code_page);
6922 /* Ideally, we should get reason from FormatMessage. This is the Windows
6923 2000 English version of the message. */
6924 const char *reason = "No mapping for the Unicode character exists "
6925 "in the target code page.";
6926 /* each step cannot decode more than 1 character, but a character can be
6927 represented as a surrogate pair */
6928 wchar_t buffer[2], *startout, *out;
6929 int insize, outsize;
6930 PyObject *errorHandler = NULL;
6931 PyObject *exc = NULL;
6932 PyObject *encoding_obj = NULL;
6933 char *encoding;
6934 DWORD err;
6935 int ret = -1;
6936
6937 assert(size > 0);
6938
6939 encoding = code_page_name(code_page, &encoding_obj);
6940 if (encoding == NULL)
6941 return -1;
6942
6943 if (errors == NULL || strcmp(errors, "strict") == 0) {
6944 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6945 UnicodeDecodeError. */
6946 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6947 if (exc != NULL) {
6948 PyCodec_StrictErrors(exc);
6949 Py_CLEAR(exc);
6950 }
6951 goto error;
6952 }
6953
6954 if (*v == NULL) {
6955 /* Create unicode object */
6956 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6957 PyErr_NoMemory();
6958 goto error;
6959 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006960 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 if (*v == NULL)
6962 goto error;
6963 startout = PyUnicode_AS_UNICODE(*v);
6964 }
6965 else {
6966 /* Extend unicode object */
6967 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6968 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6969 PyErr_NoMemory();
6970 goto error;
6971 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006972 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006973 goto error;
6974 startout = PyUnicode_AS_UNICODE(*v) + n;
6975 }
6976
6977 /* Decode the byte string character per character */
6978 out = startout;
6979 while (in < endin)
6980 {
6981 /* Decode a character */
6982 insize = 1;
6983 do
6984 {
6985 outsize = MultiByteToWideChar(code_page, flags,
6986 in, insize,
6987 buffer, Py_ARRAY_LENGTH(buffer));
6988 if (outsize > 0)
6989 break;
6990 err = GetLastError();
6991 if (err != ERROR_NO_UNICODE_TRANSLATION
6992 && err != ERROR_INSUFFICIENT_BUFFER)
6993 {
6994 PyErr_SetFromWindowsErr(0);
6995 goto error;
6996 }
6997 insize++;
6998 }
6999 /* 4=maximum length of a UTF-8 sequence */
7000 while (insize <= 4 && (in + insize) <= endin);
7001
7002 if (outsize <= 0) {
7003 Py_ssize_t startinpos, endinpos, outpos;
7004
7005 startinpos = in - startin;
7006 endinpos = startinpos + 1;
7007 outpos = out - PyUnicode_AS_UNICODE(*v);
7008 if (unicode_decode_call_errorhandler(
7009 errors, &errorHandler,
7010 encoding, reason,
7011 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007012 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 {
7014 goto error;
7015 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007016 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 }
7018 else {
7019 in += insize;
7020 memcpy(out, buffer, outsize * sizeof(wchar_t));
7021 out += outsize;
7022 }
7023 }
7024
7025 /* write a NUL character at the end */
7026 *out = 0;
7027
7028 /* Extend unicode object */
7029 outsize = out - startout;
7030 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007034
7035error:
7036 Py_XDECREF(encoding_obj);
7037 Py_XDECREF(errorHandler);
7038 Py_XDECREF(exc);
7039 return ret;
7040}
7041
Victor Stinner3a50e702011-10-18 21:21:00 +02007042static PyObject *
7043decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 const char *s, Py_ssize_t size,
7045 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046{
Victor Stinner76a31a62011-11-04 00:05:13 +01007047 PyObject *v = NULL;
7048 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 if (code_page < 0) {
7051 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7052 return NULL;
7053 }
7054
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007055 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 do
7059 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 if (size > INT_MAX) {
7062 chunk_size = INT_MAX;
7063 final = 0;
7064 done = 0;
7065 }
7066 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007068 {
7069 chunk_size = (int)size;
7070 final = (consumed == NULL);
7071 done = 1;
7072 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 /* Skip trailing lead-byte unless 'final' is set */
7075 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7076 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077
Victor Stinner76a31a62011-11-04 00:05:13 +01007078 if (chunk_size == 0 && done) {
7079 if (v != NULL)
7080 break;
7081 Py_INCREF(unicode_empty);
7082 return unicode_empty;
7083 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Victor Stinner76a31a62011-11-04 00:05:13 +01007085
7086 converted = decode_code_page_strict(code_page, &v,
7087 s, chunk_size);
7088 if (converted == -2)
7089 converted = decode_code_page_errors(code_page, &v,
7090 s, chunk_size,
7091 errors);
7092 assert(converted != 0);
7093
7094 if (converted < 0) {
7095 Py_XDECREF(v);
7096 return NULL;
7097 }
7098
7099 if (consumed)
7100 *consumed += converted;
7101
7102 s += converted;
7103 size -= converted;
7104 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007105
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007106 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107}
7108
Alexander Belopolsky40018472011-02-26 01:02:56 +00007109PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007110PyUnicode_DecodeCodePageStateful(int code_page,
7111 const char *s,
7112 Py_ssize_t size,
7113 const char *errors,
7114 Py_ssize_t *consumed)
7115{
7116 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7117}
7118
7119PyObject *
7120PyUnicode_DecodeMBCSStateful(const char *s,
7121 Py_ssize_t size,
7122 const char *errors,
7123 Py_ssize_t *consumed)
7124{
7125 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7126}
7127
7128PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007129PyUnicode_DecodeMBCS(const char *s,
7130 Py_ssize_t size,
7131 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007132{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7134}
7135
Victor Stinner3a50e702011-10-18 21:21:00 +02007136static DWORD
7137encode_code_page_flags(UINT code_page, const char *errors)
7138{
7139 if (code_page == CP_UTF8) {
7140 if (winver.dwMajorVersion >= 6)
7141 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7142 and later */
7143 return WC_ERR_INVALID_CHARS;
7144 else
7145 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7146 return 0;
7147 }
7148 else if (code_page == CP_UTF7) {
7149 /* CP_UTF7 only supports flags=0 */
7150 return 0;
7151 }
7152 else {
7153 if (errors != NULL && strcmp(errors, "replace") == 0)
7154 return 0;
7155 else
7156 return WC_NO_BEST_FIT_CHARS;
7157 }
7158}
7159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 * Encode a Unicode string to a Windows code page into a byte string in strict
7162 * mode.
7163 *
7164 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7165 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007168encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007169 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171{
Victor Stinner554f3f02010-06-16 23:33:54 +00007172 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 BOOL *pusedDefaultChar = &usedDefaultChar;
7174 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007175 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007176 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007177 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 const DWORD flags = encode_code_page_flags(code_page, NULL);
7179 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 /* Create a substring so that we can get the UTF-16 representation
7181 of just the slice under consideration. */
7182 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183
Martin v. Löwis3d325192011-11-04 18:23:06 +01007184 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007187 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007189 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007190
Victor Stinner2fc507f2011-11-04 20:06:39 +01007191 substring = PyUnicode_Substring(unicode, offset, offset+len);
7192 if (substring == NULL)
7193 return -1;
7194 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7195 if (p == NULL) {
7196 Py_DECREF(substring);
7197 return -1;
7198 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007199
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007200 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 outsize = WideCharToMultiByte(code_page, flags,
7202 p, size,
7203 NULL, 0,
7204 NULL, pusedDefaultChar);
7205 if (outsize <= 0)
7206 goto error;
7207 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 if (pusedDefaultChar && *pusedDefaultChar) {
7209 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007212
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007216 if (*outbytes == NULL) {
7217 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221 }
7222 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 const Py_ssize_t n = PyBytes_Size(*outbytes);
7225 if (outsize > PY_SSIZE_T_MAX - n) {
7226 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007227 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7231 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007233 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235 }
7236
7237 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 outsize = WideCharToMultiByte(code_page, flags,
7239 p, size,
7240 out, outsize,
7241 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007242 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 if (outsize <= 0)
7244 goto error;
7245 if (pusedDefaultChar && *pusedDefaultChar)
7246 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007247 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007248
Victor Stinner3a50e702011-10-18 21:21:00 +02007249error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007250 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7252 return -2;
7253 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007254 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007255}
7256
Victor Stinner3a50e702011-10-18 21:21:00 +02007257/*
7258 * Encode a Unicode string to a Windows code page into a byte string using a
7259 * error handler.
7260 *
7261 * Returns consumed characters if succeed, or raise a WindowsError and returns
7262 * -1 on other error.
7263 */
7264static int
7265encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007266 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007267 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007268{
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007270 Py_ssize_t pos = unicode_offset;
7271 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 /* Ideally, we should get reason from FormatMessage. This is the Windows
7273 2000 English version of the message. */
7274 const char *reason = "invalid character";
7275 /* 4=maximum length of a UTF-8 sequence */
7276 char buffer[4];
7277 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7278 Py_ssize_t outsize;
7279 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 PyObject *errorHandler = NULL;
7281 PyObject *exc = NULL;
7282 PyObject *encoding_obj = NULL;
7283 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007284 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 PyObject *rep;
7286 int ret = -1;
7287
7288 assert(insize > 0);
7289
7290 encoding = code_page_name(code_page, &encoding_obj);
7291 if (encoding == NULL)
7292 return -1;
7293
7294 if (errors == NULL || strcmp(errors, "strict") == 0) {
7295 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7296 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007297 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 if (exc != NULL) {
7299 PyCodec_StrictErrors(exc);
7300 Py_DECREF(exc);
7301 }
7302 Py_XDECREF(encoding_obj);
7303 return -1;
7304 }
7305
7306 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7307 pusedDefaultChar = &usedDefaultChar;
7308 else
7309 pusedDefaultChar = NULL;
7310
7311 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7312 PyErr_NoMemory();
7313 goto error;
7314 }
7315 outsize = insize * Py_ARRAY_LENGTH(buffer);
7316
7317 if (*outbytes == NULL) {
7318 /* Create string object */
7319 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7320 if (*outbytes == NULL)
7321 goto error;
7322 out = PyBytes_AS_STRING(*outbytes);
7323 }
7324 else {
7325 /* Extend string object */
7326 Py_ssize_t n = PyBytes_Size(*outbytes);
7327 if (n > PY_SSIZE_T_MAX - outsize) {
7328 PyErr_NoMemory();
7329 goto error;
7330 }
7331 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7332 goto error;
7333 out = PyBytes_AS_STRING(*outbytes) + n;
7334 }
7335
7336 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007337 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007339 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7340 wchar_t chars[2];
7341 int charsize;
7342 if (ch < 0x10000) {
7343 chars[0] = (wchar_t)ch;
7344 charsize = 1;
7345 }
7346 else {
7347 ch -= 0x10000;
7348 chars[0] = 0xd800 + (ch >> 10);
7349 chars[1] = 0xdc00 + (ch & 0x3ff);
7350 charsize = 2;
7351 }
7352
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 buffer, Py_ARRAY_LENGTH(buffer),
7356 NULL, pusedDefaultChar);
7357 if (outsize > 0) {
7358 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7359 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007360 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 memcpy(out, buffer, outsize);
7362 out += outsize;
7363 continue;
7364 }
7365 }
7366 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7367 PyErr_SetFromWindowsErr(0);
7368 goto error;
7369 }
7370
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 rep = unicode_encode_call_errorhandler(
7372 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007373 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007374 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 if (rep == NULL)
7376 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007377 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007378
7379 if (PyBytes_Check(rep)) {
7380 outsize = PyBytes_GET_SIZE(rep);
7381 if (outsize != 1) {
7382 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7383 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7384 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7385 Py_DECREF(rep);
7386 goto error;
7387 }
7388 out = PyBytes_AS_STRING(*outbytes) + offset;
7389 }
7390 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7391 out += outsize;
7392 }
7393 else {
7394 Py_ssize_t i;
7395 enum PyUnicode_Kind kind;
7396 void *data;
7397
7398 if (PyUnicode_READY(rep) < 0) {
7399 Py_DECREF(rep);
7400 goto error;
7401 }
7402
7403 outsize = PyUnicode_GET_LENGTH(rep);
7404 if (outsize != 1) {
7405 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7406 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7407 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7408 Py_DECREF(rep);
7409 goto error;
7410 }
7411 out = PyBytes_AS_STRING(*outbytes) + offset;
7412 }
7413 kind = PyUnicode_KIND(rep);
7414 data = PyUnicode_DATA(rep);
7415 for (i=0; i < outsize; i++) {
7416 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7417 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007418 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007419 encoding, unicode,
7420 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 "unable to encode error handler result to ASCII");
7422 Py_DECREF(rep);
7423 goto error;
7424 }
7425 *out = (unsigned char)ch;
7426 out++;
7427 }
7428 }
7429 Py_DECREF(rep);
7430 }
7431 /* write a NUL byte */
7432 *out = 0;
7433 outsize = out - PyBytes_AS_STRING(*outbytes);
7434 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7435 if (_PyBytes_Resize(outbytes, outsize) < 0)
7436 goto error;
7437 ret = 0;
7438
7439error:
7440 Py_XDECREF(encoding_obj);
7441 Py_XDECREF(errorHandler);
7442 Py_XDECREF(exc);
7443 return ret;
7444}
7445
Victor Stinner3a50e702011-10-18 21:21:00 +02007446static PyObject *
7447encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007448 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 const char *errors)
7450{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007453 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007454 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007455
Victor Stinner2fc507f2011-11-04 20:06:39 +01007456 if (PyUnicode_READY(unicode) < 0)
7457 return NULL;
7458 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (code_page < 0) {
7461 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7462 return NULL;
7463 }
7464
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007466 return PyBytes_FromStringAndSize(NULL, 0);
7467
Victor Stinner7581cef2011-11-03 22:32:33 +01007468 offset = 0;
7469 do
7470 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007473 chunks. */
7474 if (len > INT_MAX/2) {
7475 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 done = 0;
7477 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007478 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007482 done = 1;
7483 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007487 errors);
7488 if (ret == -2)
7489 ret = encode_code_page_errors(code_page, &outbytes,
7490 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 if (ret < 0) {
7493 Py_XDECREF(outbytes);
7494 return NULL;
7495 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496
Victor Stinner7581cef2011-11-03 22:32:33 +01007497 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 return outbytes;
7502}
7503
7504PyObject *
7505PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7506 Py_ssize_t size,
7507 const char *errors)
7508{
Victor Stinner7581cef2011-11-03 22:32:33 +01007509 PyObject *unicode, *res;
7510 unicode = PyUnicode_FromUnicode(p, size);
7511 if (unicode == NULL)
7512 return NULL;
7513 res = encode_code_page(CP_ACP, unicode, errors);
7514 Py_DECREF(unicode);
7515 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516}
7517
7518PyObject *
7519PyUnicode_EncodeCodePage(int code_page,
7520 PyObject *unicode,
7521 const char *errors)
7522{
Victor Stinner7581cef2011-11-03 22:32:33 +01007523 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007524}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007525
Alexander Belopolsky40018472011-02-26 01:02:56 +00007526PyObject *
7527PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007528{
7529 if (!PyUnicode_Check(unicode)) {
7530 PyErr_BadArgument();
7531 return NULL;
7532 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007533 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007534}
7535
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536#undef NEED_RETRY
7537
Victor Stinner99b95382011-07-04 14:23:54 +02007538#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007539
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540/* --- Character Mapping Codec -------------------------------------------- */
7541
Alexander Belopolsky40018472011-02-26 01:02:56 +00007542PyObject *
7543PyUnicode_DecodeCharmap(const char *s,
7544 Py_ssize_t size,
7545 PyObject *mapping,
7546 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007548 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549 Py_ssize_t startinpos;
7550 Py_ssize_t endinpos;
7551 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007552 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007553 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007554 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555 PyObject *errorHandler = NULL;
7556 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007557
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 /* Default to Latin-1 */
7559 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007562 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007566 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007567 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007569 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007570 Py_ssize_t maplen;
7571 enum PyUnicode_Kind kind;
7572 void *data;
7573 Py_UCS4 x;
7574
7575 if (PyUnicode_READY(mapping) < 0)
7576 return NULL;
7577
7578 maplen = PyUnicode_GET_LENGTH(mapping);
7579 data = PyUnicode_DATA(mapping);
7580 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 while (s < e) {
7582 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007585 x = PyUnicode_READ(kind, data, ch);
7586 else
7587 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007589 if (x == 0xfffe)
7590 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 startinpos = s-starts;
7593 endinpos = startinpos+1;
7594 if (unicode_decode_call_errorhandler(
7595 errors, &errorHandler,
7596 "charmap", "character maps to <undefined>",
7597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007598 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 goto onError;
7600 }
7601 continue;
7602 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007603
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007604 if (unicode_putchar(&v, &outpos, x) < 0)
7605 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007607 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007608 }
7609 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 while (s < e) {
7611 unsigned char ch = *s;
7612 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007613
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7615 w = PyLong_FromLong((long)ch);
7616 if (w == NULL)
7617 goto onError;
7618 x = PyObject_GetItem(mapping, w);
7619 Py_DECREF(w);
7620 if (x == NULL) {
7621 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7622 /* No mapping found means: mapping is undefined. */
7623 PyErr_Clear();
7624 x = Py_None;
7625 Py_INCREF(x);
7626 } else
7627 goto onError;
7628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 /* Apply mapping */
7631 if (PyLong_Check(x)) {
7632 long value = PyLong_AS_LONG(x);
7633 if (value < 0 || value > 65535) {
7634 PyErr_SetString(PyExc_TypeError,
7635 "character mapping must be in range(65536)");
7636 Py_DECREF(x);
7637 goto onError;
7638 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007639 if (unicode_putchar(&v, &outpos, value) < 0)
7640 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 }
7642 else if (x == Py_None) {
7643 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 startinpos = s-starts;
7645 endinpos = startinpos+1;
7646 if (unicode_decode_call_errorhandler(
7647 errors, &errorHandler,
7648 "charmap", "character maps to <undefined>",
7649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007650 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 Py_DECREF(x);
7652 goto onError;
7653 }
7654 Py_DECREF(x);
7655 continue;
7656 }
7657 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007658 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007659
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007660 if (PyUnicode_READY(x) < 0)
7661 goto onError;
7662 targetsize = PyUnicode_GET_LENGTH(x);
7663
7664 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007666 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007667 PyUnicode_READ_CHAR(x, 0)) < 0)
7668 goto onError;
7669 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 else if (targetsize > 1) {
7671 /* 1-n mapping */
7672 if (targetsize > extrachars) {
7673 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 Py_ssize_t needed = (targetsize - extrachars) + \
7675 (targetsize << 2);
7676 extrachars += needed;
7677 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007678 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007679 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 Py_DECREF(x);
7681 goto onError;
7682 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007684 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7685 goto onError;
7686 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7687 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 extrachars -= targetsize;
7689 }
7690 /* 1-0 mapping: skip the character */
7691 }
7692 else {
7693 /* wrong return value */
7694 PyErr_SetString(PyExc_TypeError,
7695 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007696 Py_DECREF(x);
7697 goto onError;
7698 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 Py_DECREF(x);
7700 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007703 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007704 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 Py_XDECREF(errorHandler);
7706 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007707 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007708
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 Py_XDECREF(errorHandler);
7711 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 Py_XDECREF(v);
7713 return NULL;
7714}
7715
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716/* Charmap encoding: the lookup table */
7717
Alexander Belopolsky40018472011-02-26 01:02:56 +00007718struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 PyObject_HEAD
7720 unsigned char level1[32];
7721 int count2, count3;
7722 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007723};
7724
7725static PyObject*
7726encoding_map_size(PyObject *obj, PyObject* args)
7727{
7728 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007729 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007731}
7732
7733static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007734 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 PyDoc_STR("Return the size (in bytes) of this object") },
7736 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007737};
7738
7739static void
7740encoding_map_dealloc(PyObject* o)
7741{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743}
7744
/* Type object of the charmap encoding lookup table ("EncodingMap").
   Only the deallocator and the method table are filled in; every other
   slot is left at 0.  The initializer is positional, so the entries must
   stay in exactly this order. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7788
7789PyObject*
7790PyUnicode_BuildEncodingMap(PyObject* string)
7791{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007792 PyObject *result;
7793 struct encoding_map *mresult;
7794 int i;
7795 int need_dict = 0;
7796 unsigned char level1[32];
7797 unsigned char level2[512];
7798 unsigned char *mlevel1, *mlevel2, *mlevel3;
7799 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007800 int kind;
7801 void *data;
7802 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007805 PyErr_BadArgument();
7806 return NULL;
7807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007808 kind = PyUnicode_KIND(string);
7809 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007810 memset(level1, 0xFF, sizeof level1);
7811 memset(level2, 0xFF, sizeof level2);
7812
7813 /* If there isn't a one-to-one mapping of NULL to \0,
7814 or if there are non-BMP characters, we need to use
7815 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007817 need_dict = 1;
7818 for (i = 1; i < 256; i++) {
7819 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 ch = PyUnicode_READ(kind, data, i);
7821 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822 need_dict = 1;
7823 break;
7824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007825 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007826 /* unmapped character */
7827 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 l1 = ch >> 11;
7829 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 if (level1[l1] == 0xFF)
7831 level1[l1] = count2++;
7832 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834 }
7835
7836 if (count2 >= 0xFF || count3 >= 0xFF)
7837 need_dict = 1;
7838
7839 if (need_dict) {
7840 PyObject *result = PyDict_New();
7841 PyObject *key, *value;
7842 if (!result)
7843 return NULL;
7844 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007846 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847 if (!key || !value)
7848 goto failed1;
7849 if (PyDict_SetItem(result, key, value) == -1)
7850 goto failed1;
7851 Py_DECREF(key);
7852 Py_DECREF(value);
7853 }
7854 return result;
7855 failed1:
7856 Py_XDECREF(key);
7857 Py_XDECREF(value);
7858 Py_DECREF(result);
7859 return NULL;
7860 }
7861
7862 /* Create a three-level trie */
7863 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7864 16*count2 + 128*count3 - 1);
7865 if (!result)
7866 return PyErr_NoMemory();
7867 PyObject_Init(result, &EncodingMapType);
7868 mresult = (struct encoding_map*)result;
7869 mresult->count2 = count2;
7870 mresult->count3 = count3;
7871 mlevel1 = mresult->level1;
7872 mlevel2 = mresult->level23;
7873 mlevel3 = mresult->level23 + 16*count2;
7874 memcpy(mlevel1, level1, 32);
7875 memset(mlevel2, 0xFF, 16*count2);
7876 memset(mlevel3, 0, 128*count3);
7877 count3 = 0;
7878 for (i = 1; i < 256; i++) {
7879 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007880 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 /* unmapped character */
7882 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 o1 = PyUnicode_READ(kind, data, i)>>11;
7884 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 i2 = 16*mlevel1[o1] + o2;
7886 if (mlevel2[i2] == 0xFF)
7887 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 i3 = 128*mlevel2[i2] + o3;
7890 mlevel3[i3] = i;
7891 }
7892 return result;
7893}
7894
7895static int
Victor Stinner22168992011-11-20 17:09:18 +01007896encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897{
7898 struct encoding_map *map = (struct encoding_map*)mapping;
7899 int l1 = c>>11;
7900 int l2 = (c>>7) & 0xF;
7901 int l3 = c & 0x7F;
7902 int i;
7903
Victor Stinner22168992011-11-20 17:09:18 +01007904 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 if (c == 0)
7907 return 0;
7908 /* level 1*/
7909 i = map->level1[l1];
7910 if (i == 0xFF) {
7911 return -1;
7912 }
7913 /* level 2*/
7914 i = map->level23[16*i+l2];
7915 if (i == 0xFF) {
7916 return -1;
7917 }
7918 /* level 3 */
7919 i = map->level23[16*map->count2 + 128*i + l3];
7920 if (i == 0) {
7921 return -1;
7922 }
7923 return i;
7924}
7925
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926/* Lookup the character ch in the mapping. If the character
7927 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007928 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007929static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007930charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931{
Christian Heimes217cfd12007-12-02 14:31:20 +00007932 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 PyObject *x;
7934
7935 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007937 x = PyObject_GetItem(mapping, w);
7938 Py_DECREF(w);
7939 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7941 /* No mapping found means: mapping is undefined. */
7942 PyErr_Clear();
7943 x = Py_None;
7944 Py_INCREF(x);
7945 return x;
7946 } else
7947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007949 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007951 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 long value = PyLong_AS_LONG(x);
7953 if (value < 0 || value > 255) {
7954 PyErr_SetString(PyExc_TypeError,
7955 "character mapping must be in range(256)");
7956 Py_DECREF(x);
7957 return NULL;
7958 }
7959 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007961 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 /* wrong return value */
7965 PyErr_Format(PyExc_TypeError,
7966 "character mapping must return integer, bytes or None, not %.400s",
7967 x->ob_type->tp_name);
7968 Py_DECREF(x);
7969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 }
7971}
7972
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007974charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007975{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7977 /* exponentially overallocate to minimize reallocations */
7978 if (requiredsize < 2*outsize)
7979 requiredsize = 2*outsize;
7980 if (_PyBytes_Resize(outobj, requiredsize))
7981 return -1;
7982 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007983}
7984
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred; a Python exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007989 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990 space is available. Return a new reference to the object that
7991 was put in the output buffer, or Py_None, if the mapping was undefined
7992 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007993 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007994static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007995charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007996 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998 PyObject *rep;
7999 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008000 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001
Christian Heimes90aa7642007-12-19 02:45:37 +00008002 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005 if (res == -1)
8006 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 if (outsize<requiredsize)
8008 if (charmapencode_resize(outobj, outpos, requiredsize))
8009 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008010 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 outstart[(*outpos)++] = (char)res;
8012 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013 }
8014
8015 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 Py_DECREF(rep);
8020 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008021 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 if (PyLong_Check(rep)) {
8023 Py_ssize_t requiredsize = *outpos+1;
8024 if (outsize<requiredsize)
8025 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8026 Py_DECREF(rep);
8027 return enc_EXCEPTION;
8028 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008029 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008031 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 else {
8033 const char *repchars = PyBytes_AS_STRING(rep);
8034 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8035 Py_ssize_t requiredsize = *outpos+repsize;
8036 if (outsize<requiredsize)
8037 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8038 Py_DECREF(rep);
8039 return enc_EXCEPTION;
8040 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008041 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 memcpy(outstart + *outpos, repchars, repsize);
8043 *outpos += repsize;
8044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046 Py_DECREF(rep);
8047 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048}
8049
8050/* handle an error in PyUnicode_EncodeCharmap
8051 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008052static int
8053charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008054 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008055 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008056 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008057 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008058{
8059 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008060 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008061 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008062 enum PyUnicode_Kind kind;
8063 void *data;
8064 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008066 Py_ssize_t collstartpos = *inpos;
8067 Py_ssize_t collendpos = *inpos+1;
8068 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069 char *encoding = "charmap";
8070 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008072 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008073 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008075 if (PyUnicode_READY(unicode) < 0)
8076 return -1;
8077 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 /* find all unencodable characters */
8079 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008081 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008082 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008083 val = encoding_map_lookup(ch, mapping);
8084 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 break;
8086 ++collendpos;
8087 continue;
8088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008089
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008090 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8091 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 if (rep==NULL)
8093 return -1;
8094 else if (rep!=Py_None) {
8095 Py_DECREF(rep);
8096 break;
8097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 }
8101 /* cache callback name lookup
8102 * (if not done yet, i.e. it's the first error) */
8103 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 if ((errors==NULL) || (!strcmp(errors, "strict")))
8105 *known_errorHandler = 1;
8106 else if (!strcmp(errors, "replace"))
8107 *known_errorHandler = 2;
8108 else if (!strcmp(errors, "ignore"))
8109 *known_errorHandler = 3;
8110 else if (!strcmp(errors, "xmlcharrefreplace"))
8111 *known_errorHandler = 4;
8112 else
8113 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114 }
8115 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008117 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 return -1;
8119 case 2: /* replace */
8120 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 x = charmapencode_output('?', mapping, res, respos);
8122 if (x==enc_EXCEPTION) {
8123 return -1;
8124 }
8125 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008126 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return -1;
8128 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 }
8130 /* fall through */
8131 case 3: /* ignore */
8132 *inpos = collendpos;
8133 break;
8134 case 4: /* xmlcharrefreplace */
8135 /* generate replacement (temporarily (mis)uses p) */
8136 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 char buffer[2+29+1+1];
8138 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008139 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 for (cp = buffer; *cp; ++cp) {
8141 x = charmapencode_output(*cp, mapping, res, respos);
8142 if (x==enc_EXCEPTION)
8143 return -1;
8144 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008145 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return -1;
8147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 }
8149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008150 *inpos = collendpos;
8151 break;
8152 default:
8153 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008154 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008156 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008158 if (PyBytes_Check(repunicode)) {
8159 /* Directly copy bytes result to output. */
8160 Py_ssize_t outsize = PyBytes_Size(*res);
8161 Py_ssize_t requiredsize;
8162 repsize = PyBytes_Size(repunicode);
8163 requiredsize = *respos + repsize;
8164 if (requiredsize > outsize)
8165 /* Make room for all additional bytes. */
8166 if (charmapencode_resize(res, respos, requiredsize)) {
8167 Py_DECREF(repunicode);
8168 return -1;
8169 }
8170 memcpy(PyBytes_AsString(*res) + *respos,
8171 PyBytes_AsString(repunicode), repsize);
8172 *respos += repsize;
8173 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008174 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008175 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008178 if (PyUnicode_READY(repunicode) < 0) {
8179 Py_DECREF(repunicode);
8180 return -1;
8181 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008182 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008183 data = PyUnicode_DATA(repunicode);
8184 kind = PyUnicode_KIND(repunicode);
8185 for (index = 0; index < repsize; index++) {
8186 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8187 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008189 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return -1;
8191 }
8192 else if (x==enc_FAILED) {
8193 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008194 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 return -1;
8196 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 }
8198 *inpos = newpos;
8199 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200 }
8201 return 0;
8202}
8203
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008205_PyUnicode_EncodeCharmap(PyObject *unicode,
8206 PyObject *mapping,
8207 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 /* output object */
8210 PyObject *res = NULL;
8211 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008212 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008213 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216 PyObject *errorHandler = NULL;
8217 PyObject *exc = NULL;
8218 /* the following variable is used for caching string comparisons
8219 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8220 * 3=ignore, 4=xmlcharrefreplace */
8221 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008222
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008223 if (PyUnicode_READY(unicode) < 0)
8224 return NULL;
8225 size = PyUnicode_GET_LENGTH(unicode);
8226
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227 /* Default to Latin-1 */
8228 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008229 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 /* allocate enough for a simple encoding without
8232 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008233 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 if (res == NULL)
8235 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008236 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008242 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 if (x==enc_EXCEPTION) /* error */
8244 goto onError;
8245 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008246 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 &exc,
8248 &known_errorHandler, &errorHandler, errors,
8249 &res, &respos)) {
8250 goto onError;
8251 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 else
8254 /* done with this character => adjust input position */
8255 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008259 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008260 if (_PyBytes_Resize(&res, respos) < 0)
8261 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 Py_XDECREF(exc);
8264 Py_XDECREF(errorHandler);
8265 return res;
8266
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 Py_XDECREF(res);
8269 Py_XDECREF(exc);
8270 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 return NULL;
8272}
8273
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008274/* Deprecated */
8275PyObject *
8276PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8277 Py_ssize_t size,
8278 PyObject *mapping,
8279 const char *errors)
8280{
8281 PyObject *result;
8282 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8283 if (unicode == NULL)
8284 return NULL;
8285 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8286 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008287 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288}
8289
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290PyObject *
8291PyUnicode_AsCharmapString(PyObject *unicode,
8292 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293{
8294 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 PyErr_BadArgument();
8296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008298 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299}
8300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008302static void
8303make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008305 Py_ssize_t startpos, Py_ssize_t endpos,
8306 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 *exceptionObject = _PyUnicodeTranslateError_Create(
8310 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 }
8312 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8314 goto onError;
8315 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8316 goto onError;
8317 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8318 goto onError;
8319 return;
8320 onError:
8321 Py_DECREF(*exceptionObject);
8322 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
8324}
8325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327static void
8328raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330 Py_ssize_t startpos, Py_ssize_t endpos,
8331 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332{
8333 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337}
8338
8339/* error handling callback helper:
8340 build arguments, call the callback and check the arguments,
8341 put the result into newpos and return the replacement string, which
8342 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008343static PyObject *
8344unicode_translate_call_errorhandler(const char *errors,
8345 PyObject **errorHandler,
8346 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348 Py_ssize_t startpos, Py_ssize_t endpos,
8349 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008351 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008353 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 PyObject *restuple;
8355 PyObject *resunicode;
8356
8357 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 }
8362
8363 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367
8368 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008373 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 Py_DECREF(restuple);
8375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 }
8377 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 &resunicode, &i_newpos)) {
8379 Py_DECREF(restuple);
8380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008382 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008384 else
8385 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8388 Py_DECREF(restuple);
8389 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 Py_INCREF(resunicode);
8392 Py_DECREF(restuple);
8393 return resunicode;
8394}
8395
8396/* Lookup the character ch in the mapping and put the result in result,
8397 which must be decrefed by the caller.
8398 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008399static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401{
Christian Heimes217cfd12007-12-02 14:31:20 +00008402 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 PyObject *x;
8404
8405 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 x = PyObject_GetItem(mapping, w);
8408 Py_DECREF(w);
8409 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8411 /* No mapping found means: use 1:1 mapping. */
8412 PyErr_Clear();
8413 *result = NULL;
8414 return 0;
8415 } else
8416 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 }
8418 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 *result = x;
8420 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008422 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 long value = PyLong_AS_LONG(x);
8424 long max = PyUnicode_GetMax();
8425 if (value < 0 || value > max) {
8426 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008427 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 Py_DECREF(x);
8429 return -1;
8430 }
8431 *result = x;
8432 return 0;
8433 }
8434 else if (PyUnicode_Check(x)) {
8435 *result = x;
8436 return 0;
8437 }
8438 else {
8439 /* wrong return value */
8440 PyErr_SetString(PyExc_TypeError,
8441 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 Py_DECREF(x);
8443 return -1;
8444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445}
8446/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 if not reallocate and adjust various state variables.
8448 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008449static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008454 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 /* exponentially overallocate to minimize reallocations */
8456 if (requiredsize < 2 * oldsize)
8457 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8459 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 }
8463 return 0;
8464}
8465/* lookup the character, put the result in the output string and adjust
8466 various state variables. Return a new reference to the object that
8467 was put in the output buffer in *result, or Py_None, if the mapping was
8468 undefined (in which case no character was written).
8469 The called must decref result.
8470 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008471static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8473 PyObject *mapping, Py_UCS4 **output,
8474 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008475 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8478 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483 }
8484 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008486 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 }
8490 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 Py_ssize_t repsize;
8492 if (PyUnicode_READY(*res) == -1)
8493 return -1;
8494 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 if (repsize==1) {
8496 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 }
8499 else if (repsize!=0) {
8500 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 Py_ssize_t requiredsize = *opos +
8502 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 Py_ssize_t i;
8505 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 for(i = 0; i < repsize; i++)
8508 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 }
8511 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 return 0;
8514}
8515
Alexander Belopolsky40018472011-02-26 01:02:56 +00008516PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517_PyUnicode_TranslateCharmap(PyObject *input,
8518 PyObject *mapping,
8519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 /* input object */
8522 char *idata;
8523 Py_ssize_t size, i;
8524 int kind;
8525 /* output buffer */
8526 Py_UCS4 *output = NULL;
8527 Py_ssize_t osize;
8528 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 char *reason = "character maps to <undefined>";
8532 PyObject *errorHandler = NULL;
8533 PyObject *exc = NULL;
8534 /* the following variable is used for caching string comparisons
8535 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8536 * 3=ignore, 4=xmlcharrefreplace */
8537 int known_errorHandler = -1;
8538
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 PyErr_BadArgument();
8541 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 if (PyUnicode_READY(input) == -1)
8545 return NULL;
8546 idata = (char*)PyUnicode_DATA(input);
8547 kind = PyUnicode_KIND(input);
8548 size = PyUnicode_GET_LENGTH(input);
8549 i = 0;
8550
8551 if (size == 0) {
8552 Py_INCREF(input);
8553 return input;
8554 }
8555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 /* allocate enough for a simple 1:1 translation without
8557 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 osize = size;
8559 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8560 opos = 0;
8561 if (output == NULL) {
8562 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 /* try to encode it */
8568 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 if (charmaptranslate_output(input, i, mapping,
8570 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 Py_XDECREF(x);
8572 goto onError;
8573 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 else { /* untranslatable character */
8578 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8579 Py_ssize_t repsize;
8580 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 Py_ssize_t collstart = i;
8584 Py_ssize_t collend = i+1;
8585 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 while (collend < size) {
8589 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 goto onError;
8591 Py_XDECREF(x);
8592 if (x!=Py_None)
8593 break;
8594 ++collend;
8595 }
8596 /* cache callback name lookup
8597 * (if not done yet, i.e. it's the first error) */
8598 if (known_errorHandler==-1) {
8599 if ((errors==NULL) || (!strcmp(errors, "strict")))
8600 known_errorHandler = 1;
8601 else if (!strcmp(errors, "replace"))
8602 known_errorHandler = 2;
8603 else if (!strcmp(errors, "ignore"))
8604 known_errorHandler = 3;
8605 else if (!strcmp(errors, "xmlcharrefreplace"))
8606 known_errorHandler = 4;
8607 else
8608 known_errorHandler = 0;
8609 }
8610 switch (known_errorHandler) {
8611 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 raise_translate_exception(&exc, input, collstart,
8613 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008614 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 case 2: /* replace */
8616 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 for (coll = collstart; coll<collend; coll++)
8618 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 /* fall through */
8620 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 break;
8623 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 /* generate replacement (temporarily (mis)uses i) */
8625 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 char buffer[2+29+1+1];
8627 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8629 if (charmaptranslate_makespace(&output, &osize,
8630 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 goto onError;
8632 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 break;
8637 default:
8638 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 reason, input, &exc,
8640 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008641 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008643 if (PyUnicode_READY(repunicode) < 0) {
8644 Py_DECREF(repunicode);
8645 goto onError;
8646 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 repsize = PyUnicode_GET_LENGTH(repunicode);
8649 if (charmaptranslate_makespace(&output, &osize,
8650 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 Py_DECREF(repunicode);
8652 goto onError;
8653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 for (uni2 = 0; repsize-->0; ++uni2)
8655 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8656 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 }
8660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8662 if (!res)
8663 goto onError;
8664 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 Py_XDECREF(exc);
8666 Py_XDECREF(errorHandler);
8667 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 Py_XDECREF(exc);
8672 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 return NULL;
8674}
8675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676/* Deprecated. Use PyUnicode_Translate instead. */
8677PyObject *
8678PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8679 Py_ssize_t size,
8680 PyObject *mapping,
8681 const char *errors)
8682{
8683 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8684 if (!unicode)
8685 return NULL;
8686 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8687}
8688
Alexander Belopolsky40018472011-02-26 01:02:56 +00008689PyObject *
8690PyUnicode_Translate(PyObject *str,
8691 PyObject *mapping,
8692 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693{
8694 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008695
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 str = PyUnicode_FromObject(str);
8697 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 Py_DECREF(str);
8701 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008702
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 Py_XDECREF(str);
8705 return NULL;
8706}
Tim Petersced69f82003-09-16 20:30:58 +00008707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008709fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710{
8711 /* No need to call PyUnicode_READY(self) because this function is only
8712 called as a callback from fixup() which does it already. */
8713 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8714 const int kind = PyUnicode_KIND(self);
8715 void *data = PyUnicode_DATA(self);
8716 Py_UCS4 maxchar = 0, ch, fixed;
8717 Py_ssize_t i;
8718
8719 for (i = 0; i < len; ++i) {
8720 ch = PyUnicode_READ(kind, data, i);
8721 fixed = 0;
8722 if (ch > 127) {
8723 if (Py_UNICODE_ISSPACE(ch))
8724 fixed = ' ';
8725 else {
8726 const int decimal = Py_UNICODE_TODECIMAL(ch);
8727 if (decimal >= 0)
8728 fixed = '0' + decimal;
8729 }
8730 if (fixed != 0) {
8731 if (fixed > maxchar)
8732 maxchar = fixed;
8733 PyUnicode_WRITE(kind, data, i, fixed);
8734 }
8735 else if (ch > maxchar)
8736 maxchar = ch;
8737 }
8738 else if (ch > maxchar)
8739 maxchar = ch;
8740 }
8741
8742 return maxchar;
8743}
8744
8745PyObject *
8746_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8747{
8748 if (!PyUnicode_Check(unicode)) {
8749 PyErr_BadInternalCall();
8750 return NULL;
8751 }
8752 if (PyUnicode_READY(unicode) == -1)
8753 return NULL;
8754 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8755 /* If the string is already ASCII, just return the same string */
8756 Py_INCREF(unicode);
8757 return unicode;
8758 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008759 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760}
8761
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008762PyObject *
8763PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8764 Py_ssize_t length)
8765{
Victor Stinnerf0124502011-11-21 23:12:56 +01008766 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008767 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008768 Py_UCS4 maxchar;
8769 enum PyUnicode_Kind kind;
8770 void *data;
8771
8772 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008773 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008774 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008775 if (ch > 127) {
8776 int decimal = Py_UNICODE_TODECIMAL(ch);
8777 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008778 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008779 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008780 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008781 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008782
8783 /* Copy to a new string */
8784 decimal = PyUnicode_New(length, maxchar);
8785 if (decimal == NULL)
8786 return decimal;
8787 kind = PyUnicode_KIND(decimal);
8788 data = PyUnicode_DATA(decimal);
8789 /* Iterate over code points */
8790 for (i = 0; i < length; i++) {
8791 Py_UNICODE ch = s[i];
8792 if (ch > 127) {
8793 int decimal = Py_UNICODE_TODECIMAL(ch);
8794 if (decimal >= 0)
8795 ch = '0' + decimal;
8796 }
8797 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008799 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008800}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008801/* --- Decimal Encoder ---------------------------------------------------- */
8802
Alexander Belopolsky40018472011-02-26 01:02:56 +00008803int
8804PyUnicode_EncodeDecimal(Py_UNICODE *s,
8805 Py_ssize_t length,
8806 char *output,
8807 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008808{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008809 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008810 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008811 enum PyUnicode_Kind kind;
8812 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008813
8814 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 PyErr_BadArgument();
8816 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008817 }
8818
Victor Stinner42bf7752011-11-21 22:52:58 +01008819 unicode = PyUnicode_FromUnicode(s, length);
8820 if (unicode == NULL)
8821 return -1;
8822
Victor Stinner6345be92011-11-25 20:09:01 +01008823 if (PyUnicode_READY(unicode) < 0) {
8824 Py_DECREF(unicode);
8825 return -1;
8826 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008827 kind = PyUnicode_KIND(unicode);
8828 data = PyUnicode_DATA(unicode);
8829
Victor Stinnerb84d7232011-11-22 01:50:07 +01008830 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008831 PyObject *exc;
8832 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008834 Py_ssize_t startpos;
8835
8836 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008837
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008839 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008840 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008842 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 decimal = Py_UNICODE_TODECIMAL(ch);
8844 if (decimal >= 0) {
8845 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008846 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 continue;
8848 }
8849 if (0 < ch && ch < 256) {
8850 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008851 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 continue;
8853 }
Victor Stinner6345be92011-11-25 20:09:01 +01008854
Victor Stinner42bf7752011-11-21 22:52:58 +01008855 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008856 exc = NULL;
8857 raise_encode_exception(&exc, "decimal", unicode,
8858 startpos, startpos+1,
8859 "invalid decimal Unicode string");
8860 Py_XDECREF(exc);
8861 Py_DECREF(unicode);
8862 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008863 }
8864 /* 0-terminate the output string */
8865 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008866 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008867 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008868}
8869
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870/* --- Helpers ------------------------------------------------------------ */
8871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008873any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 Py_ssize_t start,
8875 Py_ssize_t end)
8876{
8877 int kind1, kind2, kind;
8878 void *buf1, *buf2;
8879 Py_ssize_t len1, len2, result;
8880
8881 kind1 = PyUnicode_KIND(s1);
8882 kind2 = PyUnicode_KIND(s2);
8883 kind = kind1 > kind2 ? kind1 : kind2;
8884 buf1 = PyUnicode_DATA(s1);
8885 buf2 = PyUnicode_DATA(s2);
8886 if (kind1 != kind)
8887 buf1 = _PyUnicode_AsKind(s1, kind);
8888 if (!buf1)
8889 return -2;
8890 if (kind2 != kind)
8891 buf2 = _PyUnicode_AsKind(s2, kind);
8892 if (!buf2) {
8893 if (kind1 != kind) PyMem_Free(buf1);
8894 return -2;
8895 }
8896 len1 = PyUnicode_GET_LENGTH(s1);
8897 len2 = PyUnicode_GET_LENGTH(s2);
8898
Victor Stinner794d5672011-10-10 03:21:36 +02008899 if (direction > 0) {
8900 switch(kind) {
8901 case PyUnicode_1BYTE_KIND:
8902 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8903 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8904 else
8905 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8906 break;
8907 case PyUnicode_2BYTE_KIND:
8908 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8909 break;
8910 case PyUnicode_4BYTE_KIND:
8911 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8912 break;
8913 default:
8914 assert(0); result = -2;
8915 }
8916 }
8917 else {
8918 switch(kind) {
8919 case PyUnicode_1BYTE_KIND:
8920 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8921 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8922 else
8923 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8924 break;
8925 case PyUnicode_2BYTE_KIND:
8926 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8927 break;
8928 case PyUnicode_4BYTE_KIND:
8929 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8930 break;
8931 default:
8932 assert(0); result = -2;
8933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 }
8935
8936 if (kind1 != kind)
8937 PyMem_Free(buf1);
8938 if (kind2 != kind)
8939 PyMem_Free(buf2);
8940
8941 return result;
8942}
8943
8944Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008945_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 Py_ssize_t n_buffer,
8947 void *digits, Py_ssize_t n_digits,
8948 Py_ssize_t min_width,
8949 const char *grouping,
8950 const char *thousands_sep)
8951{
8952 switch(kind) {
8953 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008954 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8955 return _PyUnicode_ascii_InsertThousandsGrouping(
8956 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8957 min_width, grouping, thousands_sep);
8958 else
8959 return _PyUnicode_ucs1_InsertThousandsGrouping(
8960 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8961 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 case PyUnicode_2BYTE_KIND:
8963 return _PyUnicode_ucs2_InsertThousandsGrouping(
8964 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8965 min_width, grouping, thousands_sep);
8966 case PyUnicode_4BYTE_KIND:
8967 return _PyUnicode_ucs4_InsertThousandsGrouping(
8968 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8969 min_width, grouping, thousands_sep);
8970 }
8971 assert(0);
8972 return -1;
8973}
8974
8975
/* helper macro to fixup start/end slice values:
     - end is clamped to len; a negative end is taken relative to len and
       clamped to 0
     - a negative start is taken relative to len and clamped to 0
   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so none of them may have side effects.  The arguments
   are parenthesized so that arbitrary expressions can be passed for len. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008990
Alexander Belopolsky40018472011-02-26 01:02:56 +00008991Py_ssize_t
8992PyUnicode_Count(PyObject *str,
8993 PyObject *substr,
8994 Py_ssize_t start,
8995 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008997 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008998 PyObject* str_obj;
8999 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 int kind1, kind2, kind;
9001 void *buf1 = NULL, *buf2 = NULL;
9002 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009003
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009004 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009007 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009008 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 Py_DECREF(str_obj);
9010 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 }
Tim Petersced69f82003-09-16 20:30:58 +00009012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 kind1 = PyUnicode_KIND(str_obj);
9014 kind2 = PyUnicode_KIND(sub_obj);
9015 kind = kind1 > kind2 ? kind1 : kind2;
9016 buf1 = PyUnicode_DATA(str_obj);
9017 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009018 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 if (!buf1)
9020 goto onError;
9021 buf2 = PyUnicode_DATA(sub_obj);
9022 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009023 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 if (!buf2)
9025 goto onError;
9026 len1 = PyUnicode_GET_LENGTH(str_obj);
9027 len2 = PyUnicode_GET_LENGTH(sub_obj);
9028
9029 ADJUST_INDICES(start, end, len1);
9030 switch(kind) {
9031 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009032 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9033 result = asciilib_count(
9034 ((Py_UCS1*)buf1) + start, end - start,
9035 buf2, len2, PY_SSIZE_T_MAX
9036 );
9037 else
9038 result = ucs1lib_count(
9039 ((Py_UCS1*)buf1) + start, end - start,
9040 buf2, len2, PY_SSIZE_T_MAX
9041 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 break;
9043 case PyUnicode_2BYTE_KIND:
9044 result = ucs2lib_count(
9045 ((Py_UCS2*)buf1) + start, end - start,
9046 buf2, len2, PY_SSIZE_T_MAX
9047 );
9048 break;
9049 case PyUnicode_4BYTE_KIND:
9050 result = ucs4lib_count(
9051 ((Py_UCS4*)buf1) + start, end - start,
9052 buf2, len2, PY_SSIZE_T_MAX
9053 );
9054 break;
9055 default:
9056 assert(0); result = 0;
9057 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009058
9059 Py_DECREF(sub_obj);
9060 Py_DECREF(str_obj);
9061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 if (kind1 != kind)
9063 PyMem_Free(buf1);
9064 if (kind2 != kind)
9065 PyMem_Free(buf2);
9066
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 onError:
9069 Py_DECREF(sub_obj);
9070 Py_DECREF(str_obj);
9071 if (kind1 != kind && buf1)
9072 PyMem_Free(buf1);
9073 if (kind2 != kind && buf2)
9074 PyMem_Free(buf2);
9075 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076}
9077
Alexander Belopolsky40018472011-02-26 01:02:56 +00009078Py_ssize_t
9079PyUnicode_Find(PyObject *str,
9080 PyObject *sub,
9081 Py_ssize_t start,
9082 Py_ssize_t end,
9083 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009085 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009086
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009090 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 Py_DECREF(str);
9093 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 }
Tim Petersced69f82003-09-16 20:30:58 +00009095
Victor Stinner794d5672011-10-10 03:21:36 +02009096 result = any_find_slice(direction,
9097 str, sub, start, end
9098 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009099
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009101 Py_DECREF(sub);
9102
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 return result;
9104}
9105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106Py_ssize_t
9107PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9108 Py_ssize_t start, Py_ssize_t end,
9109 int direction)
9110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009112 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 if (PyUnicode_READY(str) == -1)
9114 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009115 if (start < 0 || end < 0) {
9116 PyErr_SetString(PyExc_IndexError, "string index out of range");
9117 return -2;
9118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 if (end > PyUnicode_GET_LENGTH(str))
9120 end = PyUnicode_GET_LENGTH(str);
9121 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009122 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9123 kind, end-start, ch, direction);
9124 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009126 else
9127 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128}
9129
Alexander Belopolsky40018472011-02-26 01:02:56 +00009130static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009131tailmatch(PyObject *self,
9132 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009133 Py_ssize_t start,
9134 Py_ssize_t end,
9135 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 int kind_self;
9138 int kind_sub;
9139 void *data_self;
9140 void *data_sub;
9141 Py_ssize_t offset;
9142 Py_ssize_t i;
9143 Py_ssize_t end_sub;
9144
9145 if (PyUnicode_READY(self) == -1 ||
9146 PyUnicode_READY(substring) == -1)
9147 return 0;
9148
9149 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 return 1;
9151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9153 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 kind_self = PyUnicode_KIND(self);
9158 data_self = PyUnicode_DATA(self);
9159 kind_sub = PyUnicode_KIND(substring);
9160 data_sub = PyUnicode_DATA(substring);
9161 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9162
9163 if (direction > 0)
9164 offset = end;
9165 else
9166 offset = start;
9167
9168 if (PyUnicode_READ(kind_self, data_self, offset) ==
9169 PyUnicode_READ(kind_sub, data_sub, 0) &&
9170 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9171 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9172 /* If both are of the same kind, memcmp is sufficient */
9173 if (kind_self == kind_sub) {
9174 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009175 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 data_sub,
9177 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009178 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 }
9180 /* otherwise we have to compare each character by first accesing it */
9181 else {
9182 /* We do not need to compare 0 and len(substring)-1 because
9183 the if statement above ensured already that they are equal
9184 when we end up here. */
9185 // TODO: honor direction and do a forward or backwards search
9186 for (i = 1; i < end_sub; ++i) {
9187 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9188 PyUnicode_READ(kind_sub, data_sub, i))
9189 return 0;
9190 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 }
9194
9195 return 0;
9196}
9197
Alexander Belopolsky40018472011-02-26 01:02:56 +00009198Py_ssize_t
9199PyUnicode_Tailmatch(PyObject *str,
9200 PyObject *substr,
9201 Py_ssize_t start,
9202 Py_ssize_t end,
9203 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009205 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009206
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207 str = PyUnicode_FromObject(str);
9208 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009209 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 substr = PyUnicode_FromObject(substr);
9211 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 Py_DECREF(str);
9213 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 }
Tim Petersced69f82003-09-16 20:30:58 +00009215
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009216 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 Py_DECREF(str);
9219 Py_DECREF(substr);
9220 return result;
9221}
9222
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223/* Apply fixfct filter to the Unicode object self and return a
9224 reference to the modified object */
9225
Alexander Belopolsky40018472011-02-26 01:02:56 +00009226static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009227fixup(PyObject *self,
9228 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 PyObject *u;
9231 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009232 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233
Victor Stinner87af4f22011-11-21 23:03:47 +01009234 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009237 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 /* fix functions return the new maximum character in a string,
9240 if the kind of the resulting unicode object does not change,
9241 everything is fine. Otherwise we need to change the string kind
9242 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009243 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009244
9245 if (maxchar_new == 0) {
9246 /* no changes */;
9247 if (PyUnicode_CheckExact(self)) {
9248 Py_DECREF(u);
9249 Py_INCREF(self);
9250 return self;
9251 }
9252 else
9253 return u;
9254 }
9255
9256 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 maxchar_new = 127;
9258 else if (maxchar_new <= 255)
9259 maxchar_new = 255;
9260 else if (maxchar_new <= 65535)
9261 maxchar_new = 65535;
9262 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009263 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264
Victor Stinnereaab6042011-12-11 22:22:39 +01009265 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009267
9268 /* In case the maximum character changed, we need to
9269 convert the string to the new category. */
9270 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9271 if (v == NULL) {
9272 Py_DECREF(u);
9273 return NULL;
9274 }
9275 if (maxchar_new > maxchar_old) {
9276 /* If the maxchar increased so that the kind changed, not all
9277 characters are representable anymore and we need to fix the
9278 string again. This only happens in very few cases. */
9279 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9280 maxchar_old = fixfct(v);
9281 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 }
9283 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009284 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009286 Py_DECREF(u);
9287 assert(_PyUnicode_CheckConsistency(v, 1));
9288 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289}
9290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009292fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 /* No need to call PyUnicode_READY(self) because this function is only
9295 called as a callback from fixup() which does it already. */
9296 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9297 const int kind = PyUnicode_KIND(self);
9298 void *data = PyUnicode_DATA(self);
9299 int touched = 0;
9300 Py_UCS4 maxchar = 0;
9301 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 for (i = 0; i < len; ++i) {
9304 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9305 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9306 if (up != ch) {
9307 if (up > maxchar)
9308 maxchar = up;
9309 PyUnicode_WRITE(kind, data, i, up);
9310 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 else if (ch > maxchar)
9313 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314 }
9315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 if (touched)
9317 return maxchar;
9318 else
9319 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320}
9321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009323fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9326 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9327 const int kind = PyUnicode_KIND(self);
9328 void *data = PyUnicode_DATA(self);
9329 int touched = 0;
9330 Py_UCS4 maxchar = 0;
9331 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 for(i = 0; i < len; ++i) {
9334 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9335 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9336 if (lo != ch) {
9337 if (lo > maxchar)
9338 maxchar = lo;
9339 PyUnicode_WRITE(kind, data, i, lo);
9340 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 else if (ch > maxchar)
9343 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 }
9345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 if (touched)
9347 return maxchar;
9348 else
9349 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350}
9351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009353fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9356 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9357 const int kind = PyUnicode_KIND(self);
9358 void *data = PyUnicode_DATA(self);
9359 int touched = 0;
9360 Py_UCS4 maxchar = 0;
9361 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 for(i = 0; i < len; ++i) {
9364 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9365 Py_UCS4 nu = 0;
9366
9367 if (Py_UNICODE_ISUPPER(ch))
9368 nu = Py_UNICODE_TOLOWER(ch);
9369 else if (Py_UNICODE_ISLOWER(ch))
9370 nu = Py_UNICODE_TOUPPER(ch);
9371
9372 if (nu != 0) {
9373 if (nu > maxchar)
9374 maxchar = nu;
9375 PyUnicode_WRITE(kind, data, i, nu);
9376 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 else if (ch > maxchar)
9379 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 }
9381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 if (touched)
9383 return maxchar;
9384 else
9385 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386}
9387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009389fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9392 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9393 const int kind = PyUnicode_KIND(self);
9394 void *data = PyUnicode_DATA(self);
9395 int touched = 0;
9396 Py_UCS4 maxchar = 0;
9397 Py_ssize_t i = 0;
9398 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009399
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009400 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402
9403 ch = PyUnicode_READ(kind, data, i);
9404 if (!Py_UNICODE_ISUPPER(ch)) {
9405 maxchar = Py_UNICODE_TOUPPER(ch);
9406 PyUnicode_WRITE(kind, data, i, maxchar);
9407 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 ++i;
9410 for(; i < len; ++i) {
9411 ch = PyUnicode_READ(kind, data, i);
9412 if (!Py_UNICODE_ISLOWER(ch)) {
9413 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9414 if (lo > maxchar)
9415 maxchar = lo;
9416 PyUnicode_WRITE(kind, data, i, lo);
9417 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 else if (ch > maxchar)
9420 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422
9423 if (touched)
9424 return maxchar;
9425 else
9426 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427}
9428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009430fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9433 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9434 const int kind = PyUnicode_KIND(self);
9435 void *data = PyUnicode_DATA(self);
9436 Py_UCS4 maxchar = 0;
9437 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 int previous_is_cased;
9439
9440 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 if (len == 1) {
9442 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9443 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9444 if (ti != ch) {
9445 PyUnicode_WRITE(kind, data, i, ti);
9446 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 }
9448 else
9449 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 for(; i < len; ++i) {
9453 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9454 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009455
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 nu = Py_UNICODE_TOTITLE(ch);
9460
9461 if (nu > maxchar)
9462 maxchar = nu;
9463 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009464
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 if (Py_UNICODE_ISLOWER(ch) ||
9466 Py_UNICODE_ISUPPER(ch) ||
9467 Py_UNICODE_ISTITLE(ch))
9468 previous_is_cased = 1;
9469 else
9470 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473}
9474
Tim Peters8ce9f162004-08-27 01:49:32 +00009475PyObject *
9476PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009479 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009482 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9483 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009484 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009486 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009488 int use_memcpy;
9489 unsigned char *res_data = NULL, *sep_data = NULL;
9490 PyObject *last_obj;
9491 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492
Tim Peters05eba1f2004-08-27 21:32:02 +00009493 fseq = PySequence_Fast(seq, "");
9494 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009495 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009496 }
9497
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009498 /* NOTE: the following code can't call back into Python code,
9499 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009500 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009501
Tim Peters05eba1f2004-08-27 21:32:02 +00009502 seqlen = PySequence_Fast_GET_SIZE(fseq);
9503 /* If empty sequence, return u"". */
9504 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009505 Py_DECREF(fseq);
9506 Py_INCREF(unicode_empty);
9507 res = unicode_empty;
9508 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009509 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009510
Tim Peters05eba1f2004-08-27 21:32:02 +00009511 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009512 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009513 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009514 if (seqlen == 1) {
9515 if (PyUnicode_CheckExact(items[0])) {
9516 res = items[0];
9517 Py_INCREF(res);
9518 Py_DECREF(fseq);
9519 return res;
9520 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009521 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009522 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009523 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009524 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009525 /* Set up sep and seplen */
9526 if (separator == NULL) {
9527 /* fall back to a blank space separator */
9528 sep = PyUnicode_FromOrdinal(' ');
9529 if (!sep)
9530 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009531 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009532 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009533 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009534 else {
9535 if (!PyUnicode_Check(separator)) {
9536 PyErr_Format(PyExc_TypeError,
9537 "separator: expected str instance,"
9538 " %.80s found",
9539 Py_TYPE(separator)->tp_name);
9540 goto onError;
9541 }
9542 if (PyUnicode_READY(separator))
9543 goto onError;
9544 sep = separator;
9545 seplen = PyUnicode_GET_LENGTH(separator);
9546 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9547 /* inc refcount to keep this code path symmetric with the
9548 above case of a blank separator */
9549 Py_INCREF(sep);
9550 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009551 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009552 }
9553
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009554 /* There are at least two things to join, or else we have a subclass
9555 * of str in the sequence.
9556 * Do a pre-pass to figure out the total amount of space we'll
9557 * need (sz), and see whether all argument are strings.
9558 */
9559 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009560#ifdef Py_DEBUG
9561 use_memcpy = 0;
9562#else
9563 use_memcpy = 1;
9564#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009565 for (i = 0; i < seqlen; i++) {
9566 const Py_ssize_t old_sz = sz;
9567 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 if (!PyUnicode_Check(item)) {
9569 PyErr_Format(PyExc_TypeError,
9570 "sequence item %zd: expected str instance,"
9571 " %.80s found",
9572 i, Py_TYPE(item)->tp_name);
9573 goto onError;
9574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 if (PyUnicode_READY(item) == -1)
9576 goto onError;
9577 sz += PyUnicode_GET_LENGTH(item);
9578 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009579 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009580 if (i != 0)
9581 sz += seplen;
9582 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9583 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009585 goto onError;
9586 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009587 if (use_memcpy && last_obj != NULL) {
9588 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9589 use_memcpy = 0;
9590 }
9591 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009592 }
Tim Petersced69f82003-09-16 20:30:58 +00009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009595 if (res == NULL)
9596 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009597
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009598 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009599#ifdef Py_DEBUG
9600 use_memcpy = 0;
9601#else
9602 if (use_memcpy) {
9603 res_data = PyUnicode_1BYTE_DATA(res);
9604 kind = PyUnicode_KIND(res);
9605 if (seplen != 0)
9606 sep_data = PyUnicode_1BYTE_DATA(sep);
9607 }
9608#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009610 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009611 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009613 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009614 if (use_memcpy) {
9615 Py_MEMCPY(res_data,
9616 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009617 kind * seplen);
9618 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009619 }
9620 else {
9621 copy_characters(res, res_offset, sep, 0, seplen);
9622 res_offset += seplen;
9623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009624 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009625 itemlen = PyUnicode_GET_LENGTH(item);
9626 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009627 if (use_memcpy) {
9628 Py_MEMCPY(res_data,
9629 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009630 kind * itemlen);
9631 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009632 }
9633 else {
9634 copy_characters(res, res_offset, item, 0, itemlen);
9635 res_offset += itemlen;
9636 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009637 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009638 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009639 if (use_memcpy)
9640 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009641 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009642 else
9643 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009644
Tim Peters05eba1f2004-08-27 21:32:02 +00009645 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009647 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009651 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009653 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 return NULL;
9655}
9656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657#define FILL(kind, data, value, start, length) \
9658 do { \
9659 Py_ssize_t i_ = 0; \
9660 assert(kind != PyUnicode_WCHAR_KIND); \
9661 switch ((kind)) { \
9662 case PyUnicode_1BYTE_KIND: { \
9663 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9664 memset(to_, (unsigned char)value, length); \
9665 break; \
9666 } \
9667 case PyUnicode_2BYTE_KIND: { \
9668 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9669 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9670 break; \
9671 } \
9672 default: { \
9673 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9674 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9675 break; \
9676 } \
9677 } \
9678 } while (0)
9679
Victor Stinner9310abb2011-10-05 00:59:23 +02009680static PyObject *
9681pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009682 Py_ssize_t left,
9683 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 PyObject *u;
9687 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009688 int kind;
9689 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690
9691 if (left < 0)
9692 left = 0;
9693 if (right < 0)
9694 right = 0;
9695
Victor Stinnerc4b49542011-12-11 22:44:26 +01009696 if (left == 0 && right == 0)
9697 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9700 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009701 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9702 return NULL;
9703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9705 if (fill > maxchar)
9706 maxchar = fill;
9707 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009708 if (!u)
9709 return NULL;
9710
9711 kind = PyUnicode_KIND(u);
9712 data = PyUnicode_DATA(u);
9713 if (left)
9714 FILL(kind, data, fill, 0, left);
9715 if (right)
9716 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009717 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009718 assert(_PyUnicode_CheckConsistency(u, 1));
9719 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722
Alexander Belopolsky40018472011-02-26 01:02:56 +00009723PyObject *
9724PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727
9728 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 switch(PyUnicode_KIND(string)) {
9733 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009734 if (PyUnicode_IS_ASCII(string))
9735 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009736 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009737 PyUnicode_GET_LENGTH(string), keepends);
9738 else
9739 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009740 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009741 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 break;
9743 case PyUnicode_2BYTE_KIND:
9744 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009745 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 PyUnicode_GET_LENGTH(string), keepends);
9747 break;
9748 case PyUnicode_4BYTE_KIND:
9749 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009750 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 PyUnicode_GET_LENGTH(string), keepends);
9752 break;
9753 default:
9754 assert(0);
9755 list = 0;
9756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757 Py_DECREF(string);
9758 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759}
9760
Alexander Belopolsky40018472011-02-26 01:02:56 +00009761static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009762split(PyObject *self,
9763 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009764 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 int kind1, kind2, kind;
9767 void *buf1, *buf2;
9768 Py_ssize_t len1, len2;
9769 PyObject* out;
9770
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009772 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 if (PyUnicode_READY(self) == -1)
9775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 if (substring == NULL)
9778 switch(PyUnicode_KIND(self)) {
9779 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009780 if (PyUnicode_IS_ASCII(self))
9781 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009782 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009783 PyUnicode_GET_LENGTH(self), maxcount
9784 );
9785 else
9786 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009787 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009788 PyUnicode_GET_LENGTH(self), maxcount
9789 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 case PyUnicode_2BYTE_KIND:
9791 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009792 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 PyUnicode_GET_LENGTH(self), maxcount
9794 );
9795 case PyUnicode_4BYTE_KIND:
9796 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009797 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009798 PyUnicode_GET_LENGTH(self), maxcount
9799 );
9800 default:
9801 assert(0);
9802 return NULL;
9803 }
9804
9805 if (PyUnicode_READY(substring) == -1)
9806 return NULL;
9807
9808 kind1 = PyUnicode_KIND(self);
9809 kind2 = PyUnicode_KIND(substring);
9810 kind = kind1 > kind2 ? kind1 : kind2;
9811 buf1 = PyUnicode_DATA(self);
9812 buf2 = PyUnicode_DATA(substring);
9813 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009814 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 if (!buf1)
9816 return NULL;
9817 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009818 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 if (!buf2) {
9820 if (kind1 != kind) PyMem_Free(buf1);
9821 return NULL;
9822 }
9823 len1 = PyUnicode_GET_LENGTH(self);
9824 len2 = PyUnicode_GET_LENGTH(substring);
9825
9826 switch(kind) {
9827 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009828 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9829 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009831 else
9832 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 break;
9835 case PyUnicode_2BYTE_KIND:
9836 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009837 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 break;
9839 case PyUnicode_4BYTE_KIND:
9840 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009841 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 break;
9843 default:
9844 out = NULL;
9845 }
9846 if (kind1 != kind)
9847 PyMem_Free(buf1);
9848 if (kind2 != kind)
9849 PyMem_Free(buf2);
9850 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851}
9852
Alexander Belopolsky40018472011-02-26 01:02:56 +00009853static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009854rsplit(PyObject *self,
9855 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009856 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 int kind1, kind2, kind;
9859 void *buf1, *buf2;
9860 Py_ssize_t len1, len2;
9861 PyObject* out;
9862
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009863 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009864 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (PyUnicode_READY(self) == -1)
9867 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (substring == NULL)
9870 switch(PyUnicode_KIND(self)) {
9871 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009872 if (PyUnicode_IS_ASCII(self))
9873 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009874 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009875 PyUnicode_GET_LENGTH(self), maxcount
9876 );
9877 else
9878 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009880 PyUnicode_GET_LENGTH(self), maxcount
9881 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 case PyUnicode_2BYTE_KIND:
9883 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009884 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 PyUnicode_GET_LENGTH(self), maxcount
9886 );
9887 case PyUnicode_4BYTE_KIND:
9888 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009889 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 PyUnicode_GET_LENGTH(self), maxcount
9891 );
9892 default:
9893 assert(0);
9894 return NULL;
9895 }
9896
9897 if (PyUnicode_READY(substring) == -1)
9898 return NULL;
9899
9900 kind1 = PyUnicode_KIND(self);
9901 kind2 = PyUnicode_KIND(substring);
9902 kind = kind1 > kind2 ? kind1 : kind2;
9903 buf1 = PyUnicode_DATA(self);
9904 buf2 = PyUnicode_DATA(substring);
9905 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009906 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (!buf1)
9908 return NULL;
9909 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009910 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 if (!buf2) {
9912 if (kind1 != kind) PyMem_Free(buf1);
9913 return NULL;
9914 }
9915 len1 = PyUnicode_GET_LENGTH(self);
9916 len2 = PyUnicode_GET_LENGTH(substring);
9917
9918 switch(kind) {
9919 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009920 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9921 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 else
9924 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 break;
9927 case PyUnicode_2BYTE_KIND:
9928 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009929 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 break;
9931 case PyUnicode_4BYTE_KIND:
9932 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009933 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 break;
9935 default:
9936 out = NULL;
9937 }
9938 if (kind1 != kind)
9939 PyMem_Free(buf1);
9940 if (kind2 != kind)
9941 PyMem_Free(buf2);
9942 return out;
9943}
9944
9945static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009946anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9947 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948{
9949 switch(kind) {
9950 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9952 return asciilib_find(buf1, len1, buf2, len2, offset);
9953 else
9954 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 case PyUnicode_2BYTE_KIND:
9956 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9957 case PyUnicode_4BYTE_KIND:
9958 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9959 }
9960 assert(0);
9961 return -1;
9962}
9963
9964static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009965anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9966 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967{
9968 switch(kind) {
9969 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009970 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9971 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9972 else
9973 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 case PyUnicode_2BYTE_KIND:
9975 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9976 case PyUnicode_4BYTE_KIND:
9977 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9978 }
9979 assert(0);
9980 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009981}
9982
Alexander Belopolsky40018472011-02-26 01:02:56 +00009983static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984replace(PyObject *self, PyObject *str1,
9985 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 PyObject *u;
9988 char *sbuf = PyUnicode_DATA(self);
9989 char *buf1 = PyUnicode_DATA(str1);
9990 char *buf2 = PyUnicode_DATA(str2);
9991 int srelease = 0, release1 = 0, release2 = 0;
9992 int skind = PyUnicode_KIND(self);
9993 int kind1 = PyUnicode_KIND(str1);
9994 int kind2 = PyUnicode_KIND(str2);
9995 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9996 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9997 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009998 int mayshrink;
9999 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
10001 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010004 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005
Victor Stinner59de0ee2011-10-07 10:01:28 +020010006 if (str1 == str2)
10007 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 if (skind < kind1)
10009 /* substring too wide to be present */
10010 goto nothing;
10011
Victor Stinner49a0a212011-10-12 23:46:10 +020010012 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10013 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10014 /* Replacing str1 with str2 may cause a maxchar reduction in the
10015 result string. */
10016 mayshrink = (maxchar_str2 < maxchar);
10017 maxchar = Py_MAX(maxchar, maxchar_str2);
10018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010020 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010021 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010023 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010025 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010026 Py_UCS4 u1, u2;
10027 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010029 if (findchar(sbuf, PyUnicode_KIND(self),
10030 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010031 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010034 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010036 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 rkind = PyUnicode_KIND(u);
10038 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10039 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010040 if (--maxcount < 0)
10041 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010043 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010044 }
10045 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 int rkind = skind;
10047 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 if (kind1 < rkind) {
10050 /* widen substring */
10051 buf1 = _PyUnicode_AsKind(str1, rkind);
10052 if (!buf1) goto error;
10053 release1 = 1;
10054 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010055 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010056 if (i < 0)
10057 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 if (rkind > kind2) {
10059 /* widen replacement */
10060 buf2 = _PyUnicode_AsKind(str2, rkind);
10061 if (!buf2) goto error;
10062 release2 = 1;
10063 }
10064 else if (rkind < kind2) {
10065 /* widen self and buf1 */
10066 rkind = kind2;
10067 if (release1) PyMem_Free(buf1);
10068 sbuf = _PyUnicode_AsKind(self, rkind);
10069 if (!sbuf) goto error;
10070 srelease = 1;
10071 buf1 = _PyUnicode_AsKind(str1, rkind);
10072 if (!buf1) goto error;
10073 release1 = 1;
10074 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010075 u = PyUnicode_New(slen, maxchar);
10076 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010078 assert(PyUnicode_KIND(u) == rkind);
10079 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010080
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010082 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010083 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010085 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010087
10088 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010089 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010091 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010092 if (i == -1)
10093 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010094 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010096 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 }
10101 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 Py_ssize_t n, i, j, ires;
10103 Py_ssize_t product, new_size;
10104 int rkind = skind;
10105 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010108 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 buf1 = _PyUnicode_AsKind(str1, rkind);
10110 if (!buf1) goto error;
10111 release1 = 1;
10112 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010113 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 if (n == 0)
10115 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010117 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 buf2 = _PyUnicode_AsKind(str2, rkind);
10119 if (!buf2) goto error;
10120 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010123 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 rkind = kind2;
10125 sbuf = _PyUnicode_AsKind(self, rkind);
10126 if (!sbuf) goto error;
10127 srelease = 1;
10128 if (release1) PyMem_Free(buf1);
10129 buf1 = _PyUnicode_AsKind(str1, rkind);
10130 if (!buf1) goto error;
10131 release1 = 1;
10132 }
10133 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10134 PyUnicode_GET_LENGTH(str1))); */
10135 product = n * (len2-len1);
10136 if ((product / (len2-len1)) != n) {
10137 PyErr_SetString(PyExc_OverflowError,
10138 "replace string is too long");
10139 goto error;
10140 }
10141 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010142 if (new_size == 0) {
10143 Py_INCREF(unicode_empty);
10144 u = unicode_empty;
10145 goto done;
10146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10148 PyErr_SetString(PyExc_OverflowError,
10149 "replace string is too long");
10150 goto error;
10151 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 u = PyUnicode_New(new_size, maxchar);
10153 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010155 assert(PyUnicode_KIND(u) == rkind);
10156 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 ires = i = 0;
10158 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 while (n-- > 0) {
10160 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010161 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010162 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010163 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010164 if (j == -1)
10165 break;
10166 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010168 memcpy(res + rkind * ires,
10169 sbuf + rkind * i,
10170 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010172 }
10173 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010175 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010177 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010183 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010184 memcpy(res + rkind * ires,
10185 sbuf + rkind * i,
10186 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010187 }
10188 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010189 /* interleave */
10190 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 if (--n <= 0)
10196 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010197 memcpy(res + rkind * ires,
10198 sbuf + rkind * i,
10199 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 ires++;
10201 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010202 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010203 memcpy(res + rkind * ires,
10204 sbuf + rkind * i,
10205 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010206 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010207 }
10208
10209 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010210 unicode_adjust_maxchar(&u);
10211 if (u == NULL)
10212 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010214
10215 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 if (srelease)
10217 PyMem_FREE(sbuf);
10218 if (release1)
10219 PyMem_FREE(buf1);
10220 if (release2)
10221 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010222 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224
Benjamin Peterson29060642009-01-31 22:14:21 +000010225 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010226 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (srelease)
10228 PyMem_FREE(sbuf);
10229 if (release1)
10230 PyMem_FREE(buf1);
10231 if (release2)
10232 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010233 return unicode_result_unchanged(self);
10234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 error:
10236 if (srelease && sbuf)
10237 PyMem_FREE(sbuf);
10238 if (release1 && buf1)
10239 PyMem_FREE(buf1);
10240 if (release2 && buf2)
10241 PyMem_FREE(buf2);
10242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243}
10244
10245/* --- Unicode Object Methods --------------------------------------------- */
10246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010247PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010248 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249\n\
10250Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010251characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252
10253static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010254unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 return fixup(self, fixtitle);
10257}
10258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010259PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261\n\
10262Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010263have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
10265static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010266unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268 return fixup(self, fixcapitalize);
10269}
10270
#if 0
/* Compiled out: kept for reference only. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into words, capitalize each word in place inside the list,
   then join the list back into one string.  Returns a new reference, or
   NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *previous = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup(previous, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* The list slot is overwritten directly, so drop the reference it
           held to the old word first. */
        Py_DECREF(previous);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10308
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010309/* Argument converter. Coerces to a single unicode character */
10310
10311static int
10312convert_uc(PyObject *obj, void *addr)
10313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010315 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010316
Benjamin Peterson14339b62009-01-31 16:36:08 +000010317 uniobj = PyUnicode_FromObject(obj);
10318 if (uniobj == NULL) {
10319 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010320 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010321 return 0;
10322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010324 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010326 Py_DECREF(uniobj);
10327 return 0;
10328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010330 Py_DECREF(uniobj);
10331 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010332}
10333
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010334PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010337Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010338done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339
10340static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010341unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010343 Py_ssize_t marg, left;
10344 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 Py_UCS4 fillchar = ' ';
10346
Victor Stinnere9a29352011-10-01 02:14:59 +020010347 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349
Victor Stinnerc4b49542011-12-11 22:44:26 +010010350 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351 return NULL;
10352
Victor Stinnerc4b49542011-12-11 22:44:26 +010010353 if (PyUnicode_GET_LENGTH(self) >= width)
10354 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
Victor Stinnerc4b49542011-12-11 22:44:26 +010010356 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 left = marg / 2 + (marg & width & 1);
10358
Victor Stinner9310abb2011-10-05 00:59:23 +020010359 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360}
10361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362/* This function assumes that str1 and str2 are readied by the caller. */
10363
Marc-André Lemburge5034372000-08-08 08:04:29 +000010364static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010365unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 int kind1, kind2;
10368 void *data1, *data2;
10369 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 kind1 = PyUnicode_KIND(str1);
10372 kind2 = PyUnicode_KIND(str2);
10373 data1 = PyUnicode_DATA(str1);
10374 data2 = PyUnicode_DATA(str2);
10375 len1 = PyUnicode_GET_LENGTH(str1);
10376 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 for (i = 0; i < len1 && i < len2; ++i) {
10379 Py_UCS4 c1, c2;
10380 c1 = PyUnicode_READ(kind1, data1, i);
10381 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010382
10383 if (c1 != c2)
10384 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010385 }
10386
10387 return (len1 < len2) ? -1 : (len1 != len2);
10388}
10389
Alexander Belopolsky40018472011-02-26 01:02:56 +000010390int
10391PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10394 if (PyUnicode_READY(left) == -1 ||
10395 PyUnicode_READY(right) == -1)
10396 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010397 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010399 PyErr_Format(PyExc_TypeError,
10400 "Can't compare %.100s and %.100s",
10401 left->ob_type->tp_name,
10402 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403 return -1;
10404}
10405
Martin v. Löwis5b222132007-06-10 09:51:05 +000010406int
10407PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10408{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 Py_ssize_t i;
10410 int kind;
10411 void *data;
10412 Py_UCS4 chr;
10413
Victor Stinner910337b2011-10-03 03:20:16 +020010414 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 if (PyUnicode_READY(uni) == -1)
10416 return -1;
10417 kind = PyUnicode_KIND(uni);
10418 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010419 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10421 if (chr != str[i])
10422 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010423 /* This check keeps Python strings that end in '\0' from comparing equal
10424 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010427 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010429 return 0;
10430}
10431
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010432
Benjamin Peterson29060642009-01-31 22:14:21 +000010433#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010434 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010435
Alexander Belopolsky40018472011-02-26 01:02:56 +000010436PyObject *
10437PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010438{
10439 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010440
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010441 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10442 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (PyUnicode_READY(left) == -1 ||
10444 PyUnicode_READY(right) == -1)
10445 return NULL;
10446 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10447 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010448 if (op == Py_EQ) {
10449 Py_INCREF(Py_False);
10450 return Py_False;
10451 }
10452 if (op == Py_NE) {
10453 Py_INCREF(Py_True);
10454 return Py_True;
10455 }
10456 }
10457 if (left == right)
10458 result = 0;
10459 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010460 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010462 /* Convert the return value to a Boolean */
10463 switch (op) {
10464 case Py_EQ:
10465 v = TEST_COND(result == 0);
10466 break;
10467 case Py_NE:
10468 v = TEST_COND(result != 0);
10469 break;
10470 case Py_LE:
10471 v = TEST_COND(result <= 0);
10472 break;
10473 case Py_GE:
10474 v = TEST_COND(result >= 0);
10475 break;
10476 case Py_LT:
10477 v = TEST_COND(result == -1);
10478 break;
10479 case Py_GT:
10480 v = TEST_COND(result == 1);
10481 break;
10482 default:
10483 PyErr_BadArgument();
10484 return NULL;
10485 }
10486 Py_INCREF(v);
10487 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010488 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010489
Brian Curtindfc80e32011-08-10 20:28:54 -050010490 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010491}
10492
Alexander Belopolsky40018472011-02-26 01:02:56 +000010493int
10494PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010495{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010496 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 int kind1, kind2, kind;
10498 void *buf1, *buf2;
10499 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010500 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010501
10502 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 sub = PyUnicode_FromObject(element);
10504 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 PyErr_Format(PyExc_TypeError,
10506 "'in <string>' requires string as left operand, not %s",
10507 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (PyUnicode_READY(sub) == -1)
10511 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010512
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010514 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 Py_DECREF(sub);
10516 return -1;
10517 }
10518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 kind1 = PyUnicode_KIND(str);
10520 kind2 = PyUnicode_KIND(sub);
10521 kind = kind1 > kind2 ? kind1 : kind2;
10522 buf1 = PyUnicode_DATA(str);
10523 buf2 = PyUnicode_DATA(sub);
10524 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010525 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (!buf1) {
10527 Py_DECREF(sub);
10528 return -1;
10529 }
10530 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010531 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (!buf2) {
10533 Py_DECREF(sub);
10534 if (kind1 != kind) PyMem_Free(buf1);
10535 return -1;
10536 }
10537 len1 = PyUnicode_GET_LENGTH(str);
10538 len2 = PyUnicode_GET_LENGTH(sub);
10539
10540 switch(kind) {
10541 case PyUnicode_1BYTE_KIND:
10542 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10543 break;
10544 case PyUnicode_2BYTE_KIND:
10545 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10546 break;
10547 case PyUnicode_4BYTE_KIND:
10548 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10549 break;
10550 default:
10551 result = -1;
10552 assert(0);
10553 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010554
10555 Py_DECREF(str);
10556 Py_DECREF(sub);
10557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (kind1 != kind)
10559 PyMem_Free(buf1);
10560 if (kind2 != kind)
10561 PyMem_Free(buf2);
10562
Guido van Rossum403d68b2000-03-13 15:55:09 +000010563 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010564}
10565
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566/* Concat to string or Unicode object giving a new Unicode object. */
10567
Alexander Belopolsky40018472011-02-26 01:02:56 +000010568PyObject *
10569PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010572 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010573 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
10575 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582
10583 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010584 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010588 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591 }
10592
Victor Stinner488fa492011-12-12 00:01:39 +010010593 u_len = PyUnicode_GET_LENGTH(u);
10594 v_len = PyUnicode_GET_LENGTH(v);
10595 if (u_len > PY_SSIZE_T_MAX - v_len) {
10596 PyErr_SetString(PyExc_OverflowError,
10597 "strings are too large to concat");
10598 goto onError;
10599 }
10600 new_len = u_len + v_len;
10601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010603 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10604 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010607 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010609 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010610 copy_characters(w, 0, u, 0, u_len);
10611 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 Py_DECREF(u);
10613 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010614 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Benjamin Peterson29060642009-01-31 22:14:21 +000010617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 Py_XDECREF(u);
10619 Py_XDECREF(v);
10620 return NULL;
10621}
10622
Walter Dörwald1ab83302007-05-18 17:15:44 +000010623void
Victor Stinner23e56682011-10-03 03:54:37 +020010624PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010625{
Victor Stinner23e56682011-10-03 03:54:37 +020010626 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010627 Py_UCS4 maxchar, maxchar2;
10628 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010629
10630 if (p_left == NULL) {
10631 if (!PyErr_Occurred())
10632 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010633 return;
10634 }
Victor Stinner23e56682011-10-03 03:54:37 +020010635 left = *p_left;
10636 if (right == NULL || !PyUnicode_Check(left)) {
10637 if (!PyErr_Occurred())
10638 PyErr_BadInternalCall();
10639 goto error;
10640 }
10641
Victor Stinnere1335c72011-10-04 20:53:03 +020010642 if (PyUnicode_READY(left))
10643 goto error;
10644 if (PyUnicode_READY(right))
10645 goto error;
10646
Victor Stinner488fa492011-12-12 00:01:39 +010010647 /* Shortcuts */
10648 if (left == unicode_empty) {
10649 Py_DECREF(left);
10650 Py_INCREF(right);
10651 *p_left = right;
10652 return;
10653 }
10654 if (right == unicode_empty)
10655 return;
10656
10657 left_len = PyUnicode_GET_LENGTH(left);
10658 right_len = PyUnicode_GET_LENGTH(right);
10659 if (left_len > PY_SSIZE_T_MAX - right_len) {
10660 PyErr_SetString(PyExc_OverflowError,
10661 "strings are too large to concat");
10662 goto error;
10663 }
10664 new_len = left_len + right_len;
10665
10666 if (unicode_modifiable(left)
10667 && PyUnicode_CheckExact(right)
10668 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010669 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10670 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010671 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010672 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010673 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10674 {
10675 /* append inplace */
10676 if (unicode_resize(p_left, new_len) != 0) {
10677 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10678 * deallocated so it cannot be put back into
10679 * 'variable'. The MemoryError is raised when there
10680 * is no value in 'variable', which might (very
10681 * remotely) be a cause of incompatibilities.
10682 */
10683 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010684 }
Victor Stinner488fa492011-12-12 00:01:39 +010010685 /* copy 'right' into the newly allocated area of 'left' */
10686 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010687 }
Victor Stinner488fa492011-12-12 00:01:39 +010010688 else {
10689 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10690 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10691 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010692
Victor Stinner488fa492011-12-12 00:01:39 +010010693 /* Concat the two Unicode strings */
10694 res = PyUnicode_New(new_len, maxchar);
10695 if (res == NULL)
10696 goto error;
10697 copy_characters(res, 0, left, 0, left_len);
10698 copy_characters(res, left_len, right, 0, right_len);
10699 Py_DECREF(left);
10700 *p_left = res;
10701 }
10702 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010703 return;
10704
10705error:
Victor Stinner488fa492011-12-12 00:01:39 +010010706 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010707}
10708
10709void
10710PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10711{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010712 PyUnicode_Append(pleft, right);
10713 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010714}
10715
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010716PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010717 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010720string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010721interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010724unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010726 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010727 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010728 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 int kind1, kind2, kind;
10731 void *buf1, *buf2;
10732 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733
Jesus Ceaac451502011-04-20 17:09:23 +020010734 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10735 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 kind1 = PyUnicode_KIND(self);
10739 kind2 = PyUnicode_KIND(substring);
10740 kind = kind1 > kind2 ? kind1 : kind2;
10741 buf1 = PyUnicode_DATA(self);
10742 buf2 = PyUnicode_DATA(substring);
10743 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010744 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (!buf1) {
10746 Py_DECREF(substring);
10747 return NULL;
10748 }
10749 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010750 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 if (!buf2) {
10752 Py_DECREF(substring);
10753 if (kind1 != kind) PyMem_Free(buf1);
10754 return NULL;
10755 }
10756 len1 = PyUnicode_GET_LENGTH(self);
10757 len2 = PyUnicode_GET_LENGTH(substring);
10758
10759 ADJUST_INDICES(start, end, len1);
10760 switch(kind) {
10761 case PyUnicode_1BYTE_KIND:
10762 iresult = ucs1lib_count(
10763 ((Py_UCS1*)buf1) + start, end - start,
10764 buf2, len2, PY_SSIZE_T_MAX
10765 );
10766 break;
10767 case PyUnicode_2BYTE_KIND:
10768 iresult = ucs2lib_count(
10769 ((Py_UCS2*)buf1) + start, end - start,
10770 buf2, len2, PY_SSIZE_T_MAX
10771 );
10772 break;
10773 case PyUnicode_4BYTE_KIND:
10774 iresult = ucs4lib_count(
10775 ((Py_UCS4*)buf1) + start, end - start,
10776 buf2, len2, PY_SSIZE_T_MAX
10777 );
10778 break;
10779 default:
10780 assert(0); iresult = 0;
10781 }
10782
10783 result = PyLong_FromSsize_t(iresult);
10784
10785 if (kind1 != kind)
10786 PyMem_Free(buf1);
10787 if (kind2 != kind)
10788 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789
10790 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010791
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792 return result;
10793}
10794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010795PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010796 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010798Encode S using the codec registered for encoding. Default encoding\n\
10799is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010800handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010801a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10802'xmlcharrefreplace' as well as any other name registered with\n\
10803codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804
10805static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010806unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010808 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809 char *encoding = NULL;
10810 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010811
Benjamin Peterson308d6372009-09-18 21:42:35 +000010812 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10813 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010815 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010816}
10817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820\n\
10821Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010822If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
10824static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010825unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010827 Py_ssize_t i, j, line_pos, src_len, incr;
10828 Py_UCS4 ch;
10829 PyObject *u;
10830 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010832 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010833 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
Antoine Pitrou22425222011-10-04 19:10:51 +020010838 if (PyUnicode_READY(self) == -1)
10839 return NULL;
10840
Thomas Wouters7e474022000-07-16 12:04:32 +000010841 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010842 src_len = PyUnicode_GET_LENGTH(self);
10843 i = j = line_pos = 0;
10844 kind = PyUnicode_KIND(self);
10845 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010846 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010847 for (; i < src_len; i++) {
10848 ch = PyUnicode_READ(kind, src_data, i);
10849 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010850 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010852 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010853 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010854 goto overflow;
10855 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010857 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010861 goto overflow;
10862 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010864 if (ch == '\n' || ch == '\r')
10865 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010867 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010868 if (!found)
10869 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010870
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873 if (!u)
10874 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010875 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878
Antoine Pitroue71d5742011-10-04 15:55:09 +020010879 for (; i < src_len; i++) {
10880 ch = PyUnicode_READ(kind, src_data, i);
10881 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010883 incr = tabsize - (line_pos % tabsize);
10884 line_pos += incr;
10885 while (incr--) {
10886 PyUnicode_WRITE(kind, dest_data, j, ' ');
10887 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010888 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010890 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 line_pos++;
10893 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010894 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 if (ch == '\n' || ch == '\r')
10896 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 }
10899 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010900 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010901
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010903 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905}
10906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010907PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909\n\
10910Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010911such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912arguments start and end are interpreted as in slice notation.\n\
10913\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010914Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
10916static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010919 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010920 Py_ssize_t start;
10921 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Jesus Ceaac451502011-04-20 17:09:23 +020010924 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10925 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (PyUnicode_READY(self) == -1)
10929 return NULL;
10930 if (PyUnicode_READY(substring) == -1)
10931 return NULL;
10932
Victor Stinner7931d9a2011-11-04 00:22:48 +010010933 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934
10935 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (result == -2)
10938 return NULL;
10939
Christian Heimes217cfd12007-12-02 14:31:20 +000010940 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941}
10942
10943static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010944unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010946 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10947 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950}
10951
Guido van Rossumc2504932007-09-18 19:42:40 +000010952/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010953 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010954static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010955unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956{
Guido van Rossumc2504932007-09-18 19:42:40 +000010957 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010958 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (_PyUnicode_HASH(self) != -1)
10961 return _PyUnicode_HASH(self);
10962 if (PyUnicode_READY(self) == -1)
10963 return -1;
10964 len = PyUnicode_GET_LENGTH(self);
10965
10966 /* The hash function as a macro, gets expanded three times below. */
10967#define HASH(P) \
10968 x = (Py_uhash_t)*P << 7; \
10969 while (--len >= 0) \
10970 x = (1000003*x) ^ (Py_uhash_t)*P++;
10971
10972 switch (PyUnicode_KIND(self)) {
10973 case PyUnicode_1BYTE_KIND: {
10974 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10975 HASH(c);
10976 break;
10977 }
10978 case PyUnicode_2BYTE_KIND: {
10979 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10980 HASH(s);
10981 break;
10982 }
10983 default: {
10984 Py_UCS4 *l;
10985 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10986 "Impossible switch case in unicode_hash");
10987 l = PyUnicode_4BYTE_DATA(self);
10988 HASH(l);
10989 break;
10990 }
10991 }
10992 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10993
Guido van Rossumc2504932007-09-18 19:42:40 +000010994 if (x == -1)
10995 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010997 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011001PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011004Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
11006static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011009 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011010 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011011 Py_ssize_t start;
11012 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013
Jesus Ceaac451502011-04-20 17:09:23 +020011014 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11015 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 if (PyUnicode_READY(self) == -1)
11019 return NULL;
11020 if (PyUnicode_READY(substring) == -1)
11021 return NULL;
11022
Victor Stinner7931d9a2011-11-04 00:22:48 +010011023 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 if (result == -2)
11028 return NULL;
11029
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 if (result < 0) {
11031 PyErr_SetString(PyExc_ValueError, "substring not found");
11032 return NULL;
11033 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011034
Christian Heimes217cfd12007-12-02 14:31:20 +000011035 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036}
11037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011038PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011041Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011042at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
11044static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011045unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 Py_ssize_t i, length;
11048 int kind;
11049 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 int cased;
11051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (PyUnicode_READY(self) == -1)
11053 return NULL;
11054 length = PyUnicode_GET_LENGTH(self);
11055 kind = PyUnicode_KIND(self);
11056 data = PyUnicode_DATA(self);
11057
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 if (length == 1)
11060 return PyBool_FromLong(
11061 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011063 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011065 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011066
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 for (i = 0; i < length; i++) {
11069 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011070
Benjamin Peterson29060642009-01-31 22:14:21 +000011071 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11072 return PyBool_FromLong(0);
11073 else if (!cased && Py_UNICODE_ISLOWER(ch))
11074 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011076 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077}
11078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011082Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011083at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011086unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 Py_ssize_t i, length;
11089 int kind;
11090 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091 int cased;
11092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (PyUnicode_READY(self) == -1)
11094 return NULL;
11095 length = PyUnicode_GET_LENGTH(self);
11096 kind = PyUnicode_KIND(self);
11097 data = PyUnicode_DATA(self);
11098
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (length == 1)
11101 return PyBool_FromLong(
11102 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011104 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011107
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 for (i = 0; i < length; i++) {
11110 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011111
Benjamin Peterson29060642009-01-31 22:14:21 +000011112 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11113 return PyBool_FromLong(0);
11114 else if (!cased && Py_UNICODE_ISUPPER(ch))
11115 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011117 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118}
11119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011123Return True if S is a titlecased string and there is at least one\n\
11124character in S, i.e. upper- and titlecase characters may only\n\
11125follow uncased characters and lowercase characters only cased ones.\n\
11126Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127
11128static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011129unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 Py_ssize_t i, length;
11132 int kind;
11133 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134 int cased, previous_is_cased;
11135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (PyUnicode_READY(self) == -1)
11137 return NULL;
11138 length = PyUnicode_GET_LENGTH(self);
11139 kind = PyUnicode_KIND(self);
11140 data = PyUnicode_DATA(self);
11141
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (length == 1) {
11144 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11145 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11146 (Py_UNICODE_ISUPPER(ch) != 0));
11147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011149 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011152
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 cased = 0;
11154 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 for (i = 0; i < length; i++) {
11156 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011157
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11159 if (previous_is_cased)
11160 return PyBool_FromLong(0);
11161 previous_is_cased = 1;
11162 cased = 1;
11163 }
11164 else if (Py_UNICODE_ISLOWER(ch)) {
11165 if (!previous_is_cased)
11166 return PyBool_FromLong(0);
11167 previous_is_cased = 1;
11168 cased = 1;
11169 }
11170 else
11171 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011173 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174}
11175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011176PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011177 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011179Return True if all characters in S are whitespace\n\
11180and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
11182static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011183unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 Py_ssize_t i, length;
11186 int kind;
11187 void *data;
11188
11189 if (PyUnicode_READY(self) == -1)
11190 return NULL;
11191 length = PyUnicode_GET_LENGTH(self);
11192 kind = PyUnicode_KIND(self);
11193 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (length == 1)
11197 return PyBool_FromLong(
11198 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 for (i = 0; i < length; i++) {
11205 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011206 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011207 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011209 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210}
11211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011212PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011213 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011214\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011215Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011216and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011217
11218static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011219unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 Py_ssize_t i, length;
11222 int kind;
11223 void *data;
11224
11225 if (PyUnicode_READY(self) == -1)
11226 return NULL;
11227 length = PyUnicode_GET_LENGTH(self);
11228 kind = PyUnicode_KIND(self);
11229 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011230
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011231 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 if (length == 1)
11233 return PyBool_FromLong(
11234 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011235
11236 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 for (i = 0; i < length; i++) {
11241 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011243 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011244 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011245}
11246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011247PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011249\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011250Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011251and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011252
11253static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011254unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 int kind;
11257 void *data;
11258 Py_ssize_t len, i;
11259
11260 if (PyUnicode_READY(self) == -1)
11261 return NULL;
11262
11263 kind = PyUnicode_KIND(self);
11264 data = PyUnicode_DATA(self);
11265 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011266
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011267 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 if (len == 1) {
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11270 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11271 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011272
11273 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 for (i = 0; i < len; i++) {
11278 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011279 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011282 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011283}
11284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011285PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011288Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011289False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290
11291static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011292unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 Py_ssize_t i, length;
11295 int kind;
11296 void *data;
11297
11298 if (PyUnicode_READY(self) == -1)
11299 return NULL;
11300 length = PyUnicode_GET_LENGTH(self);
11301 kind = PyUnicode_KIND(self);
11302 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 if (length == 1)
11306 return PyBool_FromLong(
11307 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011309 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 for (i = 0; i < length; i++) {
11314 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011317 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318}
11319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011320PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011323Return True if all characters in S are digits\n\
11324and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
11326static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011327unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 Py_ssize_t i, length;
11330 int kind;
11331 void *data;
11332
11333 if (PyUnicode_READY(self) == -1)
11334 return NULL;
11335 length = PyUnicode_GET_LENGTH(self);
11336 kind = PyUnicode_KIND(self);
11337 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (length == 1) {
11341 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11342 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011345 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 for (i = 0; i < length; i++) {
11350 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011353 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354}
11355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011356PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011359Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011360False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
11362static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011363unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 Py_ssize_t i, length;
11366 int kind;
11367 void *data;
11368
11369 if (PyUnicode_READY(self) == -1)
11370 return NULL;
11371 length = PyUnicode_GET_LENGTH(self);
11372 kind = PyUnicode_KIND(self);
11373 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (length == 1)
11377 return PyBool_FromLong(
11378 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011380 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 for (i = 0; i < length; i++) {
11385 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011388 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
Martin v. Löwis47383402007-08-15 07:32:56 +000011391int
11392PyUnicode_IsIdentifier(PyObject *self)
11393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 int kind;
11395 void *data;
11396 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011397 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 if (PyUnicode_READY(self) == -1) {
11400 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 }
11403
11404 /* Special case for empty strings */
11405 if (PyUnicode_GET_LENGTH(self) == 0)
11406 return 0;
11407 kind = PyUnicode_KIND(self);
11408 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011409
11410 /* PEP 3131 says that the first character must be in
11411 XID_Start and subsequent characters in XID_Continue,
11412 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011413 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011414 letters, digits, underscore). However, given the current
11415 definition of XID_Start and XID_Continue, it is sufficient
11416 to check just for these, except that _ must be allowed
11417 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011419 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011420 return 0;
11421
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011422 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011425 return 1;
11426}
11427
11428PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011430\n\
11431Return True if S is a valid identifier according\n\
11432to the language definition.");
11433
11434static PyObject*
11435unicode_isidentifier(PyObject *self)
11436{
11437 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11438}
11439
Georg Brandl559e5d72008-06-11 18:37:52 +000011440PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011442\n\
11443Return True if all characters in S are considered\n\
11444printable in repr() or S is empty, False otherwise.");
11445
11446static PyObject*
11447unicode_isprintable(PyObject *self)
11448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 Py_ssize_t i, length;
11450 int kind;
11451 void *data;
11452
11453 if (PyUnicode_READY(self) == -1)
11454 return NULL;
11455 length = PyUnicode_GET_LENGTH(self);
11456 kind = PyUnicode_KIND(self);
11457 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011458
11459 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 if (length == 1)
11461 return PyBool_FromLong(
11462 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 for (i = 0; i < length; i++) {
11465 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011466 Py_RETURN_FALSE;
11467 }
11468 }
11469 Py_RETURN_TRUE;
11470}
11471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011472PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011473 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474\n\
11475Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011476iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011479unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011481 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482}
11483
Martin v. Löwis18e16552006-02-15 17:27:45 +000011484static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011485unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (PyUnicode_READY(self) == -1)
11488 return -1;
11489 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490}
11491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011492PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011495Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011496done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
11498static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011499unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011501 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 Py_UCS4 fillchar = ' ';
11503
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011504 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 return NULL;
11506
Victor Stinnerc4b49542011-12-11 22:44:26 +010011507 if (PyUnicode_READY(self) < 0)
11508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Victor Stinnerc4b49542011-12-11 22:44:26 +010011510 if (PyUnicode_GET_LENGTH(self) >= width)
11511 return unicode_result_unchanged(self);
11512
11513 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
11521static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011522unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 return fixup(self, fixlower);
11525}
11526
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11535
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011536/* externally visible for str.strip(unicode) */
11537PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011538_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 void *data;
11541 int kind;
11542 Py_ssize_t i, j, len;
11543 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11546 return NULL;
11547
11548 kind = PyUnicode_KIND(self);
11549 data = PyUnicode_DATA(self);
11550 len = PyUnicode_GET_LENGTH(self);
11551 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11552 PyUnicode_DATA(sepobj),
11553 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011554
Benjamin Peterson14339b62009-01-31 16:36:08 +000011555 i = 0;
11556 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 while (i < len &&
11558 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 i++;
11560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011561 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011562
Benjamin Peterson14339b62009-01-31 16:36:08 +000011563 j = len;
11564 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 do {
11566 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 } while (j >= i &&
11568 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011570 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011571
Victor Stinner7931d9a2011-11-04 00:22:48 +010011572 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573}
11574
11575PyObject*
11576PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11577{
11578 unsigned char *data;
11579 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011580 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581
Victor Stinnerde636f32011-10-01 03:55:54 +020011582 if (PyUnicode_READY(self) == -1)
11583 return NULL;
11584
11585 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11586
Victor Stinner12bab6d2011-10-01 01:53:49 +020011587 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011588 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589
Victor Stinner12bab6d2011-10-01 01:53:49 +020011590 length = end - start;
11591 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011592 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593
Victor Stinnerde636f32011-10-01 03:55:54 +020011594 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011595 PyErr_SetString(PyExc_IndexError, "string index out of range");
11596 return NULL;
11597 }
11598
Victor Stinnerb9275c12011-10-05 14:01:42 +020011599 if (PyUnicode_IS_ASCII(self)) {
11600 kind = PyUnicode_KIND(self);
11601 data = PyUnicode_1BYTE_DATA(self);
11602 return unicode_fromascii(data + start, length);
11603 }
11604 else {
11605 kind = PyUnicode_KIND(self);
11606 data = PyUnicode_1BYTE_DATA(self);
11607 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011608 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011609 length);
11610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
11613static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011614do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 int kind;
11617 void *data;
11618 Py_ssize_t len, i, j;
11619
11620 if (PyUnicode_READY(self) == -1)
11621 return NULL;
11622
11623 kind = PyUnicode_KIND(self);
11624 data = PyUnicode_DATA(self);
11625 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011626
Benjamin Peterson14339b62009-01-31 16:36:08 +000011627 i = 0;
11628 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011630 i++;
11631 }
11632 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633
Benjamin Peterson14339b62009-01-31 16:36:08 +000011634 j = len;
11635 if (striptype != LEFTSTRIP) {
11636 do {
11637 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011639 j++;
11640 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641
Victor Stinner7931d9a2011-11-04 00:22:48 +010011642 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643}
11644
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011645
11646static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011647do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011648{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011649 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011650
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11652 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011653
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 if (sep != NULL && sep != Py_None) {
11655 if (PyUnicode_Check(sep))
11656 return _PyUnicode_XStrip(self, striptype, sep);
11657 else {
11658 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "%s arg must be None or str",
11660 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011661 return NULL;
11662 }
11663 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664
Benjamin Peterson14339b62009-01-31 16:36:08 +000011665 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011666}
11667
11668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011669PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011671\n\
11672Return a copy of the string S with leading and trailing\n\
11673whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011674If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675
11676static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011677unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011679 if (PyTuple_GET_SIZE(args) == 0)
11680 return do_strip(self, BOTHSTRIP); /* Common case */
11681 else
11682 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683}
11684
11685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011688\n\
11689Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011690If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691
11692static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011693unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 if (PyTuple_GET_SIZE(args) == 0)
11696 return do_strip(self, LEFTSTRIP); /* Common case */
11697 else
11698 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699}
11700
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704\n\
11705Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011706If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707
11708static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011709unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011711 if (PyTuple_GET_SIZE(args) == 0)
11712 return do_strip(self, RIGHTSTRIP); /* Common case */
11713 else
11714 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715}
11716
11717
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011719unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011721 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Georg Brandl222de0f2009-04-12 12:01:50 +000011724 if (len < 1) {
11725 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011726 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
Victor Stinnerc4b49542011-12-11 22:44:26 +010011729 /* no repeat, return original string */
11730 if (len == 1)
11731 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011732
Victor Stinnerc4b49542011-12-11 22:44:26 +010011733 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 return NULL;
11735
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011736 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011737 PyErr_SetString(PyExc_OverflowError,
11738 "repeated string is too long");
11739 return NULL;
11740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011742
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011743 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 if (!u)
11745 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011746 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 if (PyUnicode_GET_LENGTH(str) == 1) {
11749 const int kind = PyUnicode_KIND(str);
11750 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11751 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011752 if (kind == PyUnicode_1BYTE_KIND)
11753 memset(to, (unsigned char)fill_char, len);
11754 else {
11755 for (n = 0; n < len; ++n)
11756 PyUnicode_WRITE(kind, to, n, fill_char);
11757 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 }
11759 else {
11760 /* number of characters copied this far */
11761 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011762 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 char *to = (char *) PyUnicode_DATA(u);
11764 Py_MEMCPY(to, PyUnicode_DATA(str),
11765 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 n = (done <= nchars-done) ? done : nchars-done;
11768 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011769 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 }
11772
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011773 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011774 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775}
11776
Alexander Belopolsky40018472011-02-26 01:02:56 +000011777PyObject *
11778PyUnicode_Replace(PyObject *obj,
11779 PyObject *subobj,
11780 PyObject *replobj,
11781 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782{
11783 PyObject *self;
11784 PyObject *str1;
11785 PyObject *str2;
11786 PyObject *result;
11787
11788 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011789 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011792 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 Py_DECREF(self);
11794 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 }
11796 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011797 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 Py_DECREF(self);
11799 Py_DECREF(str1);
11800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 Py_DECREF(self);
11804 Py_DECREF(str1);
11805 Py_DECREF(str2);
11806 return result;
11807}
11808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011809PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011810 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811\n\
11812Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011813old replaced by new. If the optional argument count is\n\
11814given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
11816static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 PyObject *str1;
11820 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011821 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 PyObject *result;
11823
Martin v. Löwis18e16552006-02-15 17:27:45 +000011824 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 str1 = PyUnicode_FromObject(str1);
11829 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11830 return NULL;
11831 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011832 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 Py_DECREF(str1);
11834 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
11837 result = replace(self, str1, str2, maxcount);
11838
11839 Py_DECREF(str1);
11840 Py_DECREF(str2);
11841 return result;
11842}
11843
Alexander Belopolsky40018472011-02-26 01:02:56 +000011844static PyObject *
11845unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011847 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 Py_ssize_t isize;
11849 Py_ssize_t osize, squote, dquote, i, o;
11850 Py_UCS4 max, quote;
11851 int ikind, okind;
11852 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011855 return NULL;
11856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 isize = PyUnicode_GET_LENGTH(unicode);
11858 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 /* Compute length of output, quote characters, and
11861 maximum character */
11862 osize = 2; /* quotes */
11863 max = 127;
11864 squote = dquote = 0;
11865 ikind = PyUnicode_KIND(unicode);
11866 for (i = 0; i < isize; i++) {
11867 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11868 switch (ch) {
11869 case '\'': squote++; osize++; break;
11870 case '"': dquote++; osize++; break;
11871 case '\\': case '\t': case '\r': case '\n':
11872 osize += 2; break;
11873 default:
11874 /* Fast-path ASCII */
11875 if (ch < ' ' || ch == 0x7f)
11876 osize += 4; /* \xHH */
11877 else if (ch < 0x7f)
11878 osize++;
11879 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11880 osize++;
11881 max = ch > max ? ch : max;
11882 }
11883 else if (ch < 0x100)
11884 osize += 4; /* \xHH */
11885 else if (ch < 0x10000)
11886 osize += 6; /* \uHHHH */
11887 else
11888 osize += 10; /* \uHHHHHHHH */
11889 }
11890 }
11891
11892 quote = '\'';
11893 if (squote) {
11894 if (dquote)
11895 /* Both squote and dquote present. Use squote,
11896 and escape them */
11897 osize += squote;
11898 else
11899 quote = '"';
11900 }
11901
11902 repr = PyUnicode_New(osize, max);
11903 if (repr == NULL)
11904 return NULL;
11905 okind = PyUnicode_KIND(repr);
11906 odata = PyUnicode_DATA(repr);
11907
11908 PyUnicode_WRITE(okind, odata, 0, quote);
11909 PyUnicode_WRITE(okind, odata, osize-1, quote);
11910
11911 for (i = 0, o = 1; i < isize; i++) {
11912 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011913
11914 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if ((ch == quote) || (ch == '\\')) {
11916 PyUnicode_WRITE(okind, odata, o++, '\\');
11917 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011918 continue;
11919 }
11920
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011922 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 PyUnicode_WRITE(okind, odata, o++, '\\');
11924 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011925 }
11926 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 PyUnicode_WRITE(okind, odata, o++, '\\');
11928 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011929 }
11930 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 PyUnicode_WRITE(okind, odata, o++, '\\');
11932 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011933 }
11934
11935 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011936 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 PyUnicode_WRITE(okind, odata, o++, '\\');
11938 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011941 }
11942
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 /* Copy ASCII characters as-is */
11944 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011946 }
11947
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011949 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011950 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011951 (categories Z* and C* except ASCII space)
11952 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011954 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 if (ch <= 0xff) {
11956 PyUnicode_WRITE(okind, odata, o++, '\\');
11957 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11959 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011960 }
11961 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 else if (ch >= 0x10000) {
11963 PyUnicode_WRITE(okind, odata, o++, '\\');
11964 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011965 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11966 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11967 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11968 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11969 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11970 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11971 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11972 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011973 }
11974 /* Map 16-bit characters to '\uxxxx' */
11975 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 PyUnicode_WRITE(okind, odata, o++, '\\');
11977 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011978 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11979 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11980 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11981 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011982 }
11983 }
11984 /* Copy characters as-is */
11985 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011987 }
11988 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011991 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011992 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993}
11994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997\n\
11998Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011999such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000arguments start and end are interpreted as in slice notation.\n\
12001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012002Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012007 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012008 Py_ssize_t start;
12009 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012010 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Jesus Ceaac451502011-04-20 17:09:23 +020012012 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12013 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (PyUnicode_READY(self) == -1)
12017 return NULL;
12018 if (PyUnicode_READY(substring) == -1)
12019 return NULL;
12020
Victor Stinner7931d9a2011-11-04 00:22:48 +010012021 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
12023 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 if (result == -2)
12026 return NULL;
12027
Christian Heimes217cfd12007-12-02 14:31:20 +000012028 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012031PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012034Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
12036static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012039 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012040 Py_ssize_t start;
12041 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012042 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043
Jesus Ceaac451502011-04-20 17:09:23 +020012044 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12045 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (PyUnicode_READY(self) == -1)
12049 return NULL;
12050 if (PyUnicode_READY(substring) == -1)
12051 return NULL;
12052
Victor Stinner7931d9a2011-11-04 00:22:48 +010012053 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
12055 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 if (result == -2)
12058 return NULL;
12059
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 if (result < 0) {
12061 PyErr_SetString(PyExc_ValueError, "substring not found");
12062 return NULL;
12063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064
Christian Heimes217cfd12007-12-02 14:31:20 +000012065 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012071Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012072done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073
12074static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012075unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012077 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 Py_UCS4 fillchar = ' ';
12079
Victor Stinnere9a29352011-10-01 02:14:59 +020012080 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012082
Victor Stinnerc4b49542011-12-11 22:44:26 +010012083 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 return NULL;
12085
Victor Stinnerc4b49542011-12-11 22:44:26 +010012086 if (PyUnicode_GET_LENGTH(self) >= width)
12087 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Victor Stinnerc4b49542011-12-11 22:44:26 +010012089 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090}
12091
Alexander Belopolsky40018472011-02-26 01:02:56 +000012092PyObject *
12093PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094{
12095 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012096
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 s = PyUnicode_FromObject(s);
12098 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012099 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 if (sep != NULL) {
12101 sep = PyUnicode_FromObject(sep);
12102 if (sep == NULL) {
12103 Py_DECREF(s);
12104 return NULL;
12105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106 }
12107
Victor Stinner9310abb2011-10-05 00:59:23 +020012108 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
12110 Py_DECREF(s);
12111 Py_XDECREF(sep);
12112 return result;
12113}
12114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012115PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117\n\
12118Return a list of the words in S, using sep as the\n\
12119delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012120splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012121whitespace string is a separator and empty strings are\n\
12122removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
12124static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012125unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126{
12127 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012128 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
Martin v. Löwis18e16552006-02-15 17:27:45 +000012130 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131 return NULL;
12132
12133 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012136 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012138 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139}
12140
Thomas Wouters477c8d52006-05-27 19:21:47 +000012141PyObject *
12142PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12143{
12144 PyObject* str_obj;
12145 PyObject* sep_obj;
12146 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 int kind1, kind2, kind;
12148 void *buf1 = NULL, *buf2 = NULL;
12149 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012150
12151 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012152 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012154 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012156 Py_DECREF(str_obj);
12157 return NULL;
12158 }
12159
Victor Stinner14f8f022011-10-05 20:58:25 +020012160 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012162 kind = Py_MAX(kind1, kind2);
12163 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012165 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (!buf1)
12167 goto onError;
12168 buf2 = PyUnicode_DATA(sep_obj);
12169 if (kind2 != kind)
12170 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12171 if (!buf2)
12172 goto onError;
12173 len1 = PyUnicode_GET_LENGTH(str_obj);
12174 len2 = PyUnicode_GET_LENGTH(sep_obj);
12175
Victor Stinner14f8f022011-10-05 20:58:25 +020012176 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012178 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12179 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12180 else
12181 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 break;
12183 case PyUnicode_2BYTE_KIND:
12184 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12185 break;
12186 case PyUnicode_4BYTE_KIND:
12187 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12188 break;
12189 default:
12190 assert(0);
12191 out = 0;
12192 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012193
12194 Py_DECREF(sep_obj);
12195 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (kind1 != kind)
12197 PyMem_Free(buf1);
12198 if (kind2 != kind)
12199 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012200
12201 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 onError:
12203 Py_DECREF(sep_obj);
12204 Py_DECREF(str_obj);
12205 if (kind1 != kind && buf1)
12206 PyMem_Free(buf1);
12207 if (kind2 != kind && buf2)
12208 PyMem_Free(buf2);
12209 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012210}
12211
12212
12213PyObject *
12214PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12215{
12216 PyObject* str_obj;
12217 PyObject* sep_obj;
12218 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 int kind1, kind2, kind;
12220 void *buf1 = NULL, *buf2 = NULL;
12221 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012222
12223 str_obj = PyUnicode_FromObject(str_in);
12224 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012226 sep_obj = PyUnicode_FromObject(sep_in);
12227 if (!sep_obj) {
12228 Py_DECREF(str_obj);
12229 return NULL;
12230 }
12231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 kind1 = PyUnicode_KIND(str_in);
12233 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012234 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 buf1 = PyUnicode_DATA(str_in);
12236 if (kind1 != kind)
12237 buf1 = _PyUnicode_AsKind(str_in, kind);
12238 if (!buf1)
12239 goto onError;
12240 buf2 = PyUnicode_DATA(sep_obj);
12241 if (kind2 != kind)
12242 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12243 if (!buf2)
12244 goto onError;
12245 len1 = PyUnicode_GET_LENGTH(str_obj);
12246 len2 = PyUnicode_GET_LENGTH(sep_obj);
12247
12248 switch(PyUnicode_KIND(str_in)) {
12249 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012250 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12251 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12252 else
12253 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 break;
12255 case PyUnicode_2BYTE_KIND:
12256 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12257 break;
12258 case PyUnicode_4BYTE_KIND:
12259 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12260 break;
12261 default:
12262 assert(0);
12263 out = 0;
12264 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012265
12266 Py_DECREF(sep_obj);
12267 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (kind1 != kind)
12269 PyMem_Free(buf1);
12270 if (kind2 != kind)
12271 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272
12273 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 onError:
12275 Py_DECREF(sep_obj);
12276 Py_DECREF(str_obj);
12277 if (kind1 != kind && buf1)
12278 PyMem_Free(buf1);
12279 if (kind2 != kind && buf2)
12280 PyMem_Free(buf2);
12281 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012282}
12283
12284PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012286\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012287Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012289found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012290
12291static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012292unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012293{
Victor Stinner9310abb2011-10-05 00:59:23 +020012294 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295}
12296
12297PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012298 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012300Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012302separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012303
12304static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012305unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306{
Victor Stinner9310abb2011-10-05 00:59:23 +020012307 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308}
12309
Alexander Belopolsky40018472011-02-26 01:02:56 +000012310PyObject *
12311PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312{
12313 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012315 s = PyUnicode_FromObject(s);
12316 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012317 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 if (sep != NULL) {
12319 sep = PyUnicode_FromObject(sep);
12320 if (sep == NULL) {
12321 Py_DECREF(s);
12322 return NULL;
12323 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012324 }
12325
Victor Stinner9310abb2011-10-05 00:59:23 +020012326 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012327
12328 Py_DECREF(s);
12329 Py_XDECREF(sep);
12330 return result;
12331}
12332
12333PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012335\n\
12336Return a list of the words in S, using sep as the\n\
12337delimiter string, starting at the end of the string and\n\
12338working to the front. If maxsplit is given, at most maxsplit\n\
12339splits are done. If sep is not specified, any whitespace string\n\
12340is a separator.");
12341
12342static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012343unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012344{
12345 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012346 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012347
Martin v. Löwis18e16552006-02-15 17:27:45 +000012348 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012349 return NULL;
12350
12351 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012353 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012354 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012355 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012356 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012357}
12358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012359PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361\n\
12362Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012363Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012364is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365
12366static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012367unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012369 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012370 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012372 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12373 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374 return NULL;
12375
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012376 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377}
12378
12379static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012380PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012382 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383}
12384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012385PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387\n\
12388Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012389and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390
12391static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012392unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 return fixup(self, fixswapcase);
12395}
12396
Georg Brandlceee0772007-11-27 23:48:05 +000012397PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012399\n\
12400Return a translation table usable for str.translate().\n\
12401If there is only one argument, it must be a dictionary mapping Unicode\n\
12402ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012403Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012404If there are two arguments, they must be strings of equal length, and\n\
12405in the resulting dictionary, each character in x will be mapped to the\n\
12406character at the same position in y. If there is a third argument, it\n\
12407must be a string, whose characters will be mapped to None in the result.");
12408
12409static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012410unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012411{
12412 PyObject *x, *y = NULL, *z = NULL;
12413 PyObject *new = NULL, *key, *value;
12414 Py_ssize_t i = 0;
12415 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012416
Georg Brandlceee0772007-11-27 23:48:05 +000012417 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12418 return NULL;
12419 new = PyDict_New();
12420 if (!new)
12421 return NULL;
12422 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 int x_kind, y_kind, z_kind;
12424 void *x_data, *y_data, *z_data;
12425
Georg Brandlceee0772007-11-27 23:48:05 +000012426 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012427 if (!PyUnicode_Check(x)) {
12428 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12429 "be a string if there is a second argument");
12430 goto err;
12431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012433 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12434 "arguments must have equal length");
12435 goto err;
12436 }
12437 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 x_kind = PyUnicode_KIND(x);
12439 y_kind = PyUnicode_KIND(y);
12440 x_data = PyUnicode_DATA(x);
12441 y_data = PyUnicode_DATA(y);
12442 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12443 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12444 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012445 if (!key || !value)
12446 goto err;
12447 res = PyDict_SetItem(new, key, value);
12448 Py_DECREF(key);
12449 Py_DECREF(value);
12450 if (res < 0)
12451 goto err;
12452 }
12453 /* create entries for deleting chars in z */
12454 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 z_kind = PyUnicode_KIND(z);
12456 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012457 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012459 if (!key)
12460 goto err;
12461 res = PyDict_SetItem(new, key, Py_None);
12462 Py_DECREF(key);
12463 if (res < 0)
12464 goto err;
12465 }
12466 }
12467 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 int kind;
12469 void *data;
12470
Georg Brandlceee0772007-11-27 23:48:05 +000012471 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012472 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012473 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12474 "to maketrans it must be a dict");
12475 goto err;
12476 }
12477 /* copy entries into the new dict, converting string keys to int keys */
12478 while (PyDict_Next(x, &i, &key, &value)) {
12479 if (PyUnicode_Check(key)) {
12480 /* convert string keys to integer keys */
12481 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012482 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012483 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12484 "table must be of length 1");
12485 goto err;
12486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 kind = PyUnicode_KIND(key);
12488 data = PyUnicode_DATA(key);
12489 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012490 if (!newkey)
12491 goto err;
12492 res = PyDict_SetItem(new, newkey, value);
12493 Py_DECREF(newkey);
12494 if (res < 0)
12495 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012496 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012497 /* just keep integer keys */
12498 if (PyDict_SetItem(new, key, value) < 0)
12499 goto err;
12500 } else {
12501 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12502 "be strings or integers");
12503 goto err;
12504 }
12505 }
12506 }
12507 return new;
12508 err:
12509 Py_DECREF(new);
12510 return NULL;
12511}
12512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012513PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515\n\
12516Return a copy of the string S, where all characters have been mapped\n\
12517through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012518Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012519Unmapped characters are left untouched. Characters mapped to None\n\
12520are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
12522static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526}
12527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012528PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012531Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
12533static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012534unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536 return fixup(self, fixupper);
12537}
12538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012539PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012542Pad a numeric string S with zeros on the left, to fill a field\n\
12543of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
12545static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012546unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012548 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012549 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012550 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 int kind;
12552 void *data;
12553 Py_UCS4 chr;
12554
Martin v. Löwis18e16552006-02-15 17:27:45 +000012555 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 return NULL;
12557
Victor Stinnerc4b49542011-12-11 22:44:26 +010012558 if (PyUnicode_READY(self) < 0)
12559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
Victor Stinnerc4b49542011-12-11 22:44:26 +010012561 if (PyUnicode_GET_LENGTH(self) >= width)
12562 return unicode_result_unchanged(self);
12563
12564 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
12566 u = pad(self, fill, 0, '0');
12567
Walter Dörwald068325e2002-04-15 13:36:47 +000012568 if (u == NULL)
12569 return NULL;
12570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 kind = PyUnicode_KIND(u);
12572 data = PyUnicode_DATA(u);
12573 chr = PyUnicode_READ(kind, data, fill);
12574
12575 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 PyUnicode_WRITE(kind, data, 0, chr);
12578 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579 }
12580
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012581 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012582 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584
#if 0
/* Disabled debugging helper: forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012593PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012596Return True if S starts with the specified prefix, False otherwise.\n\
12597With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012598With optional end, stop comparing S at that position.\n\
12599prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
12601static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012602unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012605 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012606 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012607 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012608 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012609 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610
Jesus Ceaac451502011-04-20 17:09:23 +020012611 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012613 if (PyTuple_Check(subobj)) {
12614 Py_ssize_t i;
12615 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012616 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012617 if (substring == NULL)
12618 return NULL;
12619 result = tailmatch(self, substring, start, end, -1);
12620 Py_DECREF(substring);
12621 if (result) {
12622 Py_RETURN_TRUE;
12623 }
12624 }
12625 /* nothing matched */
12626 Py_RETURN_FALSE;
12627 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012629 if (substring == NULL) {
12630 if (PyErr_ExceptionMatches(PyExc_TypeError))
12631 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12632 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012634 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012635 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012637 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638}
12639
12640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012641PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012644Return True if S ends with the specified suffix, False otherwise.\n\
12645With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012646With optional end, stop comparing S at that position.\n\
12647suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
12649static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012650unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012653 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012654 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012655 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012656 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012657 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658
Jesus Ceaac451502011-04-20 17:09:23 +020012659 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 if (PyTuple_Check(subobj)) {
12662 Py_ssize_t i;
12663 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012664 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012666 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012668 result = tailmatch(self, substring, start, end, +1);
12669 Py_DECREF(substring);
12670 if (result) {
12671 Py_RETURN_TRUE;
12672 }
12673 }
12674 Py_RETURN_FALSE;
12675 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012676 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012677 if (substring == NULL) {
12678 if (PyErr_ExceptionMatches(PyExc_TypeError))
12679 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12680 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012682 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012683 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012685 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686}
12687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012689
12690PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012692\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012693Return a formatted version of S, using substitutions from args and kwargs.\n\
12694The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012695
Eric Smith27bbca62010-11-04 17:06:58 +000012696PyDoc_STRVAR(format_map__doc__,
12697 "S.format_map(mapping) -> str\n\
12698\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012699Return a formatted version of S, using substitutions from mapping.\n\
12700The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012701
Eric Smith4a7d76d2008-05-30 18:10:19 +000012702static PyObject *
12703unicode__format__(PyObject* self, PyObject* args)
12704{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012705 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012706
12707 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12708 return NULL;
12709
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012710 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012712 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012713}
12714
Eric Smith8c663262007-08-25 02:26:07 +000012715PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012717\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012718Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012719
12720static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012721unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 Py_ssize_t size;
12724
12725 /* If it's a compact object, account for base structure +
12726 character data. */
12727 if (PyUnicode_IS_COMPACT_ASCII(v))
12728 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12729 else if (PyUnicode_IS_COMPACT(v))
12730 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012731 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 else {
12733 /* If it is a two-block object, account for base object, and
12734 for character block if present. */
12735 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012736 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012738 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 }
12740 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012741 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012742 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012744 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012745 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746
12747 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012748}
12749
12750PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012752
12753static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012754unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012755{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012756 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 if (!copy)
12758 return NULL;
12759 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012760}
12761
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762static PyMethodDef unicode_methods[] = {
12763
12764 /* Order is according to common usage: often used methods should
12765 appear first, since lookup is done sequentially. */
12766
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012767 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012768 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12769 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012770 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012771 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12772 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12773 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12774 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12775 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12776 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12777 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012778 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012779 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12780 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12781 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012782 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012783 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12784 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12785 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012786 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012788 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012789 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012790 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12791 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12792 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12793 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12794 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12795 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12796 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12797 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12798 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12799 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12800 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12801 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12802 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12803 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012804 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012805 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012806 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012807 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012808 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012809 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012810 {"maketrans", (PyCFunction) unicode_maketrans,
12811 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012812 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012813#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012814 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815#endif
12816
12817#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012818 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012819 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820#endif
12821
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823 {NULL, NULL}
12824};
12825
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012826static PyObject *
12827unicode_mod(PyObject *v, PyObject *w)
12828{
Brian Curtindfc80e32011-08-10 20:28:54 -050012829 if (!PyUnicode_Check(v))
12830 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012832}
12833
12834static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012835 0, /*nb_add*/
12836 0, /*nb_subtract*/
12837 0, /*nb_multiply*/
12838 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012839};
12840
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012842 (lenfunc) unicode_length, /* sq_length */
12843 PyUnicode_Concat, /* sq_concat */
12844 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12845 (ssizeargfunc) unicode_getitem, /* sq_item */
12846 0, /* sq_slice */
12847 0, /* sq_ass_item */
12848 0, /* sq_ass_slice */
12849 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850};
12851
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012852static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012853unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 if (PyUnicode_READY(self) == -1)
12856 return NULL;
12857
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012858 if (PyIndex_Check(item)) {
12859 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012860 if (i == -1 && PyErr_Occurred())
12861 return NULL;
12862 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012864 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012865 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012866 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012867 PyObject *result;
12868 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012869 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012870 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012874 return NULL;
12875 }
12876
12877 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012878 Py_INCREF(unicode_empty);
12879 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010012881 slicelength == PyUnicode_GET_LENGTH(self)) {
12882 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000012883 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012884 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012885 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012886 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012887 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012888 src_kind = PyUnicode_KIND(self);
12889 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012890 if (!PyUnicode_IS_ASCII(self)) {
12891 kind_limit = kind_maxchar_limit(src_kind);
12892 max_char = 0;
12893 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12894 ch = PyUnicode_READ(src_kind, src_data, cur);
12895 if (ch > max_char) {
12896 max_char = ch;
12897 if (max_char >= kind_limit)
12898 break;
12899 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012900 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012901 }
Victor Stinner55c99112011-10-13 01:17:06 +020012902 else
12903 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012904 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012905 if (result == NULL)
12906 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012907 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012908 dest_data = PyUnicode_DATA(result);
12909
12910 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012911 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12912 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012913 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012914 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012915 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012916 } else {
12917 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12918 return NULL;
12919 }
12920}
12921
12922static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012923 (lenfunc)unicode_length, /* mp_length */
12924 (binaryfunc)unicode_subscript, /* mp_subscript */
12925 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012926};
12927
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929/* Helpers for PyUnicode_Format() */
12930
12931static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012932getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012934 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 (*p_argidx)++;
12937 if (arglen < 0)
12938 return args;
12939 else
12940 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941 }
12942 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012943 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944 return NULL;
12945}
12946
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012947/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012949static PyObject *
12950formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012952 char *p;
12953 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012955
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956 x = PyFloat_AsDouble(v);
12957 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012958 return NULL;
12959
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012961 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012962
Eric Smith0923d1d2009-04-16 20:16:10 +000012963 p = PyOS_double_to_string(x, type, prec,
12964 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012965 if (p == NULL)
12966 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012968 PyMem_Free(p);
12969 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970}
12971
Tim Peters38fd5b62000-09-21 05:43:11 +000012972static PyObject*
12973formatlong(PyObject *val, int flags, int prec, int type)
12974{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012975 char *buf;
12976 int len;
12977 PyObject *str; /* temporary string object. */
12978 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012979
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12981 if (!str)
12982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012984 Py_DECREF(str);
12985 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012986}
12987
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012988static Py_UCS4
12989formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012991 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012992 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012994 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 goto onError;
12997 }
12998 else {
12999 /* Integer input truncated to a character */
13000 long x;
13001 x = PyLong_AsLong(v);
13002 if (x == -1 && PyErr_Occurred())
13003 goto onError;
13004
Victor Stinner8faf8212011-12-08 22:14:11 +010013005 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 PyErr_SetString(PyExc_OverflowError,
13007 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013008 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 }
13010
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013011 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013012 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013013
Benjamin Peterson29060642009-01-31 22:14:21 +000013014 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013015 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013017 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018}
13019
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013020static int
13021repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13022{
13023 int r;
13024 assert(count > 0);
13025 assert(PyUnicode_Check(obj));
13026 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013027 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013028 if (repeated == NULL)
13029 return -1;
13030 r = _PyAccu_Accumulate(acc, repeated);
13031 Py_DECREF(repeated);
13032 return r;
13033 }
13034 else {
13035 do {
13036 if (_PyAccu_Accumulate(acc, obj))
13037 return -1;
13038 } while (--count);
13039 return 0;
13040 }
13041}
13042
Alexander Belopolsky40018472011-02-26 01:02:56 +000013043PyObject *
13044PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 void *fmt;
13047 int fmtkind;
13048 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013050 int r;
13051 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013054 PyObject *temp = NULL;
13055 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013056 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013057 _PyAccu acc;
13058 static PyObject *plus, *minus, *blank, *zero, *percent;
13059
13060 if (!plus && !(plus = get_latin1_char('+')))
13061 return NULL;
13062 if (!minus && !(minus = get_latin1_char('-')))
13063 return NULL;
13064 if (!blank && !(blank = get_latin1_char(' ')))
13065 return NULL;
13066 if (!zero && !(zero = get_latin1_char('0')))
13067 return NULL;
13068 if (!percent && !(percent = get_latin1_char('%')))
13069 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013070
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013072 PyErr_BadInternalCall();
13073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013075 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013078 if (_PyAccu_Init(&acc))
13079 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 fmt = PyUnicode_DATA(uformat);
13081 fmtkind = PyUnicode_KIND(uformat);
13082 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13083 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 arglen = PyTuple_Size(args);
13087 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088 }
13089 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013090 arglen = -1;
13091 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013093 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013094 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096
13097 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013099 PyObject *nonfmt;
13100 Py_ssize_t nonfmtpos;
13101 nonfmtpos = fmtpos++;
13102 while (fmtcnt >= 0 &&
13103 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13104 fmtpos++;
13105 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013106 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013107 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013108 if (nonfmt == NULL)
13109 goto onError;
13110 r = _PyAccu_Accumulate(&acc, nonfmt);
13111 Py_DECREF(nonfmt);
13112 if (r)
13113 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 }
13115 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 /* Got a format specifier */
13117 int flags = 0;
13118 Py_ssize_t width = -1;
13119 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013121 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 int isnumok;
13123 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013124 void *pbuf = NULL;
13125 Py_ssize_t pindex, len;
13126 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 fmtpos++;
13129 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13130 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 Py_ssize_t keylen;
13132 PyObject *key;
13133 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013134
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 if (dict == NULL) {
13136 PyErr_SetString(PyExc_TypeError,
13137 "format requires a mapping");
13138 goto onError;
13139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 /* Skip over balanced parentheses */
13144 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013146 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 if (fmtcnt < 0 || pcount > 0) {
13153 PyErr_SetString(PyExc_ValueError,
13154 "incomplete format key");
13155 goto onError;
13156 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013157 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013158 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013159 if (key == NULL)
13160 goto onError;
13161 if (args_owned) {
13162 Py_DECREF(args);
13163 args_owned = 0;
13164 }
13165 args = PyObject_GetItem(dict, key);
13166 Py_DECREF(key);
13167 if (args == NULL) {
13168 goto onError;
13169 }
13170 args_owned = 1;
13171 arglen = -1;
13172 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 case '-': flags |= F_LJUST; continue;
13177 case '+': flags |= F_SIGN; continue;
13178 case ' ': flags |= F_BLANK; continue;
13179 case '#': flags |= F_ALT; continue;
13180 case '0': flags |= F_ZERO; continue;
13181 }
13182 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013183 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 if (c == '*') {
13185 v = getnextarg(args, arglen, &argidx);
13186 if (v == NULL)
13187 goto onError;
13188 if (!PyLong_Check(v)) {
13189 PyErr_SetString(PyExc_TypeError,
13190 "* wants int");
13191 goto onError;
13192 }
13193 width = PyLong_AsLong(v);
13194 if (width == -1 && PyErr_Occurred())
13195 goto onError;
13196 if (width < 0) {
13197 flags |= F_LJUST;
13198 width = -width;
13199 }
13200 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 }
13203 else if (c >= '0' && c <= '9') {
13204 width = c - '0';
13205 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 if (c < '0' || c > '9')
13208 break;
13209 if ((width*10) / 10 != width) {
13210 PyErr_SetString(PyExc_ValueError,
13211 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013212 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 }
13214 width = width*10 + (c - '0');
13215 }
13216 }
13217 if (c == '.') {
13218 prec = 0;
13219 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013221 if (c == '*') {
13222 v = getnextarg(args, arglen, &argidx);
13223 if (v == NULL)
13224 goto onError;
13225 if (!PyLong_Check(v)) {
13226 PyErr_SetString(PyExc_TypeError,
13227 "* wants int");
13228 goto onError;
13229 }
13230 prec = PyLong_AsLong(v);
13231 if (prec == -1 && PyErr_Occurred())
13232 goto onError;
13233 if (prec < 0)
13234 prec = 0;
13235 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 }
13238 else if (c >= '0' && c <= '9') {
13239 prec = c - '0';
13240 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 if (c < '0' || c > '9')
13243 break;
13244 if ((prec*10) / 10 != prec) {
13245 PyErr_SetString(PyExc_ValueError,
13246 "prec too big");
13247 goto onError;
13248 }
13249 prec = prec*10 + (c - '0');
13250 }
13251 }
13252 } /* prec */
13253 if (fmtcnt >= 0) {
13254 if (c == 'h' || c == 'l' || c == 'L') {
13255 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 }
13258 }
13259 if (fmtcnt < 0) {
13260 PyErr_SetString(PyExc_ValueError,
13261 "incomplete format");
13262 goto onError;
13263 }
13264 if (c != '%') {
13265 v = getnextarg(args, arglen, &argidx);
13266 if (v == NULL)
13267 goto onError;
13268 }
13269 sign = 0;
13270 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013271 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 switch (c) {
13273
13274 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013275 _PyAccu_Accumulate(&acc, percent);
13276 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013277
13278 case 's':
13279 case 'r':
13280 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013281 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 temp = v;
13283 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 }
13285 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013286 if (c == 's')
13287 temp = PyObject_Str(v);
13288 else if (c == 'r')
13289 temp = PyObject_Repr(v);
13290 else
13291 temp = PyObject_ASCII(v);
13292 if (temp == NULL)
13293 goto onError;
13294 if (PyUnicode_Check(temp))
13295 /* nothing to do */;
13296 else {
13297 Py_DECREF(temp);
13298 PyErr_SetString(PyExc_TypeError,
13299 "%s argument has non-string str()");
13300 goto onError;
13301 }
13302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013303 if (PyUnicode_READY(temp) == -1) {
13304 Py_CLEAR(temp);
13305 goto onError;
13306 }
13307 pbuf = PyUnicode_DATA(temp);
13308 kind = PyUnicode_KIND(temp);
13309 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 if (prec >= 0 && len > prec)
13311 len = prec;
13312 break;
13313
13314 case 'i':
13315 case 'd':
13316 case 'u':
13317 case 'o':
13318 case 'x':
13319 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013320 isnumok = 0;
13321 if (PyNumber_Check(v)) {
13322 PyObject *iobj=NULL;
13323
13324 if (PyLong_Check(v)) {
13325 iobj = v;
13326 Py_INCREF(iobj);
13327 }
13328 else {
13329 iobj = PyNumber_Long(v);
13330 }
13331 if (iobj!=NULL) {
13332 if (PyLong_Check(iobj)) {
13333 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013334 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 Py_DECREF(iobj);
13336 if (!temp)
13337 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 if (PyUnicode_READY(temp) == -1) {
13339 Py_CLEAR(temp);
13340 goto onError;
13341 }
13342 pbuf = PyUnicode_DATA(temp);
13343 kind = PyUnicode_KIND(temp);
13344 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 sign = 1;
13346 }
13347 else {
13348 Py_DECREF(iobj);
13349 }
13350 }
13351 }
13352 if (!isnumok) {
13353 PyErr_Format(PyExc_TypeError,
13354 "%%%c format: a number is required, "
13355 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13356 goto onError;
13357 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013358 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013360 fillobj = zero;
13361 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 break;
13363
13364 case 'e':
13365 case 'E':
13366 case 'f':
13367 case 'F':
13368 case 'g':
13369 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013370 temp = formatfloat(v, flags, prec, c);
13371 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013373 if (PyUnicode_READY(temp) == -1) {
13374 Py_CLEAR(temp);
13375 goto onError;
13376 }
13377 pbuf = PyUnicode_DATA(temp);
13378 kind = PyUnicode_KIND(temp);
13379 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013381 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013383 fillobj = zero;
13384 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 break;
13386
13387 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013388 {
13389 Py_UCS4 ch = formatchar(v);
13390 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392 temp = _PyUnicode_FromUCS4(&ch, 1);
13393 if (temp == NULL)
13394 goto onError;
13395 pbuf = PyUnicode_DATA(temp);
13396 kind = PyUnicode_KIND(temp);
13397 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013399 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013400
13401 default:
13402 PyErr_Format(PyExc_ValueError,
13403 "unsupported format character '%c' (0x%x) "
13404 "at index %zd",
13405 (31<=c && c<=126) ? (char)c : '?',
13406 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 goto onError;
13409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 /* pbuf is initialized here. */
13411 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13414 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 pindex++;
13417 }
13418 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13419 signobj = plus;
13420 len--;
13421 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 }
13423 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013424 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013426 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 else
13428 sign = 0;
13429 }
13430 if (width < len)
13431 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013433 if (fill != ' ') {
13434 assert(signobj != NULL);
13435 if (_PyAccu_Accumulate(&acc, signobj))
13436 goto onError;
13437 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 if (width > len)
13439 width--;
13440 }
13441 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013445 second = get_latin1_char(
13446 PyUnicode_READ(kind, pbuf, pindex + 1));
13447 pindex += 2;
13448 if (second == NULL ||
13449 _PyAccu_Accumulate(&acc, zero) ||
13450 _PyAccu_Accumulate(&acc, second))
13451 goto onError;
13452 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 width -= 2;
13455 if (width < 0)
13456 width = 0;
13457 len -= 2;
13458 }
13459 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013460 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013461 if (repeat_accumulate(&acc, fillobj, width - len))
13462 goto onError;
13463 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 }
13465 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013466 if (sign) {
13467 assert(signobj != NULL);
13468 if (_PyAccu_Accumulate(&acc, signobj))
13469 goto onError;
13470 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13473 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 second = get_latin1_char(
13475 PyUnicode_READ(kind, pbuf, pindex + 1));
13476 pindex += 2;
13477 if (second == NULL ||
13478 _PyAccu_Accumulate(&acc, zero) ||
13479 _PyAccu_Accumulate(&acc, second))
13480 goto onError;
13481 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013482 }
13483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013484 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013485 if (temp != NULL) {
13486 assert(pbuf == PyUnicode_DATA(temp));
13487 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013488 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013489 else {
13490 const char *p = (const char *) pbuf;
13491 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013492 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013493 v = PyUnicode_FromKindAndData(kind, p, len);
13494 }
13495 if (v == NULL)
13496 goto onError;
13497 r = _PyAccu_Accumulate(&acc, v);
13498 Py_DECREF(v);
13499 if (r)
13500 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013501 if (width > len && repeat_accumulate(&acc, blank, width - len))
13502 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 if (dict && (argidx < arglen) && c != '%') {
13504 PyErr_SetString(PyExc_TypeError,
13505 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 goto onError;
13507 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013508 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013510 } /* until end */
13511 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 PyErr_SetString(PyExc_TypeError,
13513 "not all arguments converted during string formatting");
13514 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515 }
13516
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013520 }
13521 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 Py_XDECREF(temp);
13523 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013524 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013527 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013528 Py_XDECREF(temp);
13529 Py_XDECREF(second);
13530 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013531 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013533 }
13534 return NULL;
13535}
13536
Jeremy Hylton938ace62002-07-17 16:30:39 +000013537static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013538unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13539
Tim Peters6d6c1a32001-08-02 04:15:00 +000013540static PyObject *
13541unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13542{
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013544 static char *kwlist[] = {"object", "encoding", "errors", 0};
13545 char *encoding = NULL;
13546 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013547
Benjamin Peterson14339b62009-01-31 16:36:08 +000013548 if (type != &PyUnicode_Type)
13549 return unicode_subtype_new(type, args, kwds);
13550 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013552 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013553 if (x == NULL) {
13554 Py_INCREF(unicode_empty);
13555 return unicode_empty;
13556 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013557 if (encoding == NULL && errors == NULL)
13558 return PyObject_Str(x);
13559 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013561}
13562
Guido van Rossume023fe02001-08-30 03:12:59 +000013563static PyObject *
13564unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13565{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013566 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013567 Py_ssize_t length, char_size;
13568 int share_wstr, share_utf8;
13569 unsigned int kind;
13570 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013571
Benjamin Peterson14339b62009-01-31 16:36:08 +000013572 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013573
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013574 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013575 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013576 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013577 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013578 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013579 return NULL;
13580
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013581 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013582 if (self == NULL) {
13583 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013584 return NULL;
13585 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013586 kind = PyUnicode_KIND(unicode);
13587 length = PyUnicode_GET_LENGTH(unicode);
13588
13589 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013590#ifdef Py_DEBUG
13591 _PyUnicode_HASH(self) = -1;
13592#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013593 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013594#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013595 _PyUnicode_STATE(self).interned = 0;
13596 _PyUnicode_STATE(self).kind = kind;
13597 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013598 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013599 _PyUnicode_STATE(self).ready = 1;
13600 _PyUnicode_WSTR(self) = NULL;
13601 _PyUnicode_UTF8_LENGTH(self) = 0;
13602 _PyUnicode_UTF8(self) = NULL;
13603 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013604 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013605
13606 share_utf8 = 0;
13607 share_wstr = 0;
13608 if (kind == PyUnicode_1BYTE_KIND) {
13609 char_size = 1;
13610 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13611 share_utf8 = 1;
13612 }
13613 else if (kind == PyUnicode_2BYTE_KIND) {
13614 char_size = 2;
13615 if (sizeof(wchar_t) == 2)
13616 share_wstr = 1;
13617 }
13618 else {
13619 assert(kind == PyUnicode_4BYTE_KIND);
13620 char_size = 4;
13621 if (sizeof(wchar_t) == 4)
13622 share_wstr = 1;
13623 }
13624
13625 /* Ensure we won't overflow the length. */
13626 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13627 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013629 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013630 data = PyObject_MALLOC((length + 1) * char_size);
13631 if (data == NULL) {
13632 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013633 goto onError;
13634 }
13635
Victor Stinnerc3c74152011-10-02 20:39:55 +020013636 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013637 if (share_utf8) {
13638 _PyUnicode_UTF8_LENGTH(self) = length;
13639 _PyUnicode_UTF8(self) = data;
13640 }
13641 if (share_wstr) {
13642 _PyUnicode_WSTR_LENGTH(self) = length;
13643 _PyUnicode_WSTR(self) = (wchar_t *)data;
13644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013646 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013647 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013648 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013649#ifdef Py_DEBUG
13650 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13651#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013652 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013653 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013654
13655onError:
13656 Py_DECREF(unicode);
13657 Py_DECREF(self);
13658 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013659}
13660
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013661PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013663\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013664Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013665encoding defaults to the current default string encoding.\n\
13666errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013667
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013668static PyObject *unicode_iter(PyObject *seq);
13669
/* Type object for "str".  Slots left at 0 are not set by this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13713
13714/* Initialize the Unicode implementation */
13715
Victor Stinner3a50e702011-10-18 21:21:00 +020013716int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013717{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013718 int i;
13719
Thomas Wouters477c8d52006-05-27 19:21:47 +000013720 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013721 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013722 0x000A, /* LINE FEED */
13723 0x000D, /* CARRIAGE RETURN */
13724 0x001C, /* FILE SEPARATOR */
13725 0x001D, /* GROUP SEPARATOR */
13726 0x001E, /* RECORD SEPARATOR */
13727 0x0085, /* NEXT LINE */
13728 0x2028, /* LINE SEPARATOR */
13729 0x2029, /* PARAGRAPH SEPARATOR */
13730 };
13731
Fred Drakee4315f52000-05-09 19:53:39 +000013732 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013733 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013734 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013736 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013737
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013738 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013740 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013742
13743 /* initialize the linebreak bloom filter */
13744 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013746 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013747
13748 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013749
13750#ifdef HAVE_MBCS
13751 winver.dwOSVersionInfoSize = sizeof(winver);
13752 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13753 PyErr_SetFromWindowsErr(0);
13754 return -1;
13755 }
13756#endif
13757 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013758}
13759
/* Finalize the Unicode implementation */

/* Performs no work and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13767
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768void
Thomas Wouters78890102000-07-22 19:25:51 +000013769_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013770{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013771 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013773 Py_XDECREF(unicode_empty);
13774 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013775
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013776 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 if (unicode_latin1[i]) {
13778 Py_DECREF(unicode_latin1[i]);
13779 unicode_latin1[i] = NULL;
13780 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013781 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013782 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013783 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013784}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013785
Walter Dörwald16807132007-05-25 13:52:07 +000013786void
13787PyUnicode_InternInPlace(PyObject **p)
13788{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013789 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013790 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013791#ifdef Py_DEBUG
13792 assert(s != NULL);
13793 assert(_PyUnicode_CHECK(s));
13794#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013795 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013796 return;
13797#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013798 /* If it's a subclass, we don't really know what putting
13799 it in the interned dict might do. */
13800 if (!PyUnicode_CheckExact(s))
13801 return;
13802 if (PyUnicode_CHECK_INTERNED(s))
13803 return;
13804 if (interned == NULL) {
13805 interned = PyDict_New();
13806 if (interned == NULL) {
13807 PyErr_Clear(); /* Don't leave an exception */
13808 return;
13809 }
13810 }
13811 /* It might be that the GetItem call fails even
13812 though the key is present in the dictionary,
13813 namely when this happens during a stack overflow. */
13814 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013815 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013816 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013817
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 if (t) {
13819 Py_INCREF(t);
13820 Py_DECREF(*p);
13821 *p = t;
13822 return;
13823 }
Walter Dörwald16807132007-05-25 13:52:07 +000013824
Benjamin Peterson14339b62009-01-31 16:36:08 +000013825 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013826 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013827 PyErr_Clear();
13828 PyThreadState_GET()->recursion_critical = 0;
13829 return;
13830 }
13831 PyThreadState_GET()->recursion_critical = 0;
13832 /* The two references in interned are not counted by refcnt.
13833 The deallocator will take care of this */
13834 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013836}
13837
13838void
13839PyUnicode_InternImmortal(PyObject **p)
13840{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 PyUnicode_InternInPlace(p);
13842 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013843 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013844 Py_INCREF(*p);
13845 }
Walter Dörwald16807132007-05-25 13:52:07 +000013846}
13847
13848PyObject *
13849PyUnicode_InternFromString(const char *cp)
13850{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013851 PyObject *s = PyUnicode_FromString(cp);
13852 if (s == NULL)
13853 return NULL;
13854 PyUnicode_InternInPlace(&s);
13855 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013856}
13857
Alexander Belopolsky40018472011-02-26 01:02:56 +000013858void
13859_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013860{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013862 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013863 Py_ssize_t i, n;
13864 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013865
Benjamin Peterson14339b62009-01-31 16:36:08 +000013866 if (interned == NULL || !PyDict_Check(interned))
13867 return;
13868 keys = PyDict_Keys(interned);
13869 if (keys == NULL || !PyList_Check(keys)) {
13870 PyErr_Clear();
13871 return;
13872 }
Walter Dörwald16807132007-05-25 13:52:07 +000013873
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13875 detector, interned unicode strings are not forcibly deallocated;
13876 rather, we give them their stolen references back, and then clear
13877 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013878
Benjamin Peterson14339b62009-01-31 16:36:08 +000013879 n = PyList_GET_SIZE(keys);
13880 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013882 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013883 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013884 if (PyUnicode_READY(s) == -1) {
13885 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013889 case SSTATE_NOT_INTERNED:
13890 /* XXX Shouldn't happen */
13891 break;
13892 case SSTATE_INTERNED_IMMORTAL:
13893 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013894 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013895 break;
13896 case SSTATE_INTERNED_MORTAL:
13897 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013898 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013899 break;
13900 default:
13901 Py_FatalError("Inconsistent interned string state.");
13902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013903 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 }
13905 fprintf(stderr, "total size of all interned strings: "
13906 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13907 "mortal/immortal\n", mortal_size, immortal_size);
13908 Py_DECREF(keys);
13909 PyDict_Clear(interned);
13910 Py_DECREF(interned);
13911 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013912}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013913
13914
/********************* Unicode Iterator **************************/

/* State of an iterator over the characters of a string object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;       /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13922
13923static void
13924unicodeiter_dealloc(unicodeiterobject *it)
13925{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013926 _PyObject_GC_UNTRACK(it);
13927 Py_XDECREF(it->it_seq);
13928 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013929}
13930
13931static int
13932unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13933{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013934 Py_VISIT(it->it_seq);
13935 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013936}
13937
13938static PyObject *
13939unicodeiter_next(unicodeiterobject *it)
13940{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013941 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013942
Benjamin Peterson14339b62009-01-31 16:36:08 +000013943 assert(it != NULL);
13944 seq = it->it_seq;
13945 if (seq == NULL)
13946 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013947 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13950 int kind = PyUnicode_KIND(seq);
13951 void *data = PyUnicode_DATA(seq);
13952 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13953 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 if (item != NULL)
13955 ++it->it_index;
13956 return item;
13957 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013958
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 Py_DECREF(seq);
13960 it->it_seq = NULL;
13961 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013962}
13963
13964static PyObject *
13965unicodeiter_len(unicodeiterobject *it)
13966{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013967 Py_ssize_t len = 0;
13968 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013969 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013970 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013971}
13972
/* Docstring shared by the __length_hint__ entry below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the string iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13980
/* Type object of the iterator created by unicode_iter().  Instances are
   GC-tracked (Py_TPFLAGS_HAVE_GC) and act as their own iterator. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,              /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,              /* tp_print */
    0,              /* tp_getattr */
    0,              /* tp_setattr */
    0,              /* tp_reserved */
    0,              /* tp_repr */
    0,              /* tp_as_number */
    0,              /* tp_as_sequence */
    0,              /* tp_as_mapping */
    0,              /* tp_hash */
    0,              /* tp_call */
    0,              /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,              /* tp_setattro */
    0,              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,              /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,              /* tp_clear */
    0,              /* tp_richcompare */
    0,              /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,              /* tp_members */
};
14013
14014static PyObject *
14015unicode_iter(PyObject *seq)
14016{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014017 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014018
Benjamin Peterson14339b62009-01-31 16:36:08 +000014019 if (!PyUnicode_Check(seq)) {
14020 PyErr_BadInternalCall();
14021 return NULL;
14022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014023 if (PyUnicode_READY(seq) == -1)
14024 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14026 if (it == NULL)
14027 return NULL;
14028 it->it_index = 0;
14029 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014030 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 _PyObject_GC_TRACK(it);
14032 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014033}
14034
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014035
14036size_t
14037Py_UNICODE_strlen(const Py_UNICODE *u)
14038{
14039 int res = 0;
14040 while(*u++)
14041 res++;
14042 return res;
14043}
14044
14045Py_UNICODE*
14046Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14047{
14048 Py_UNICODE *u = s1;
14049 while ((*u++ = *s2++));
14050 return s1;
14051}
14052
14053Py_UNICODE*
14054Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14055{
14056 Py_UNICODE *u = s1;
14057 while ((*u++ = *s2++))
14058 if (n-- == 0)
14059 break;
14060 return s1;
14061}
14062
14063Py_UNICODE*
14064Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14065{
14066 Py_UNICODE *u1 = s1;
14067 u1 += Py_UNICODE_strlen(u1);
14068 Py_UNICODE_strcpy(u1, s2);
14069 return s1;
14070}
14071
14072int
14073Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14074{
14075 while (*s1 && *s2 && *s1 == *s2)
14076 s1++, s2++;
14077 if (*s1 && *s2)
14078 return (*s1 < *s2) ? -1 : +1;
14079 if (*s1)
14080 return 1;
14081 if (*s2)
14082 return -1;
14083 return 0;
14084}
14085
14086int
14087Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14088{
14089 register Py_UNICODE u1, u2;
14090 for (; n != 0; n--) {
14091 u1 = *s1;
14092 u2 = *s2;
14093 if (u1 != u2)
14094 return (u1 < u2) ? -1 : +1;
14095 if (u1 == '\0')
14096 return 0;
14097 s1++;
14098 s2++;
14099 }
14100 return 0;
14101}
14102
14103Py_UNICODE*
14104Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14105{
14106 const Py_UNICODE *p;
14107 for (p = s; *p; p++)
14108 if (*p == c)
14109 return (Py_UNICODE*)p;
14110 return NULL;
14111}
14112
14113Py_UNICODE*
14114Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14115{
14116 const Py_UNICODE *p;
14117 p = s + Py_UNICODE_strlen(s);
14118 while (p != s) {
14119 p--;
14120 if (*p == c)
14121 return (Py_UNICODE*)p;
14122 }
14123 return NULL;
14124}
Victor Stinner331ea922010-08-10 16:37:20 +000014125
Victor Stinner71133ff2010-09-01 23:43:53 +000014126Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014127PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014128{
Victor Stinner577db2c2011-10-11 22:12:48 +020014129 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014130 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014132 if (!PyUnicode_Check(unicode)) {
14133 PyErr_BadArgument();
14134 return NULL;
14135 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014136 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014137 if (u == NULL)
14138 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014139 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014140 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014141 PyErr_NoMemory();
14142 return NULL;
14143 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014144 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014145 size *= sizeof(Py_UNICODE);
14146 copy = PyMem_Malloc(size);
14147 if (copy == NULL) {
14148 PyErr_NoMemory();
14149 return NULL;
14150 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014151 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014152 return copy;
14153}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014154
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exported by the _string module; each takes a single
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14165
/* Module definition for _string: a name, a docstring and the method
   table above; the remaining slots are left unset. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* method table */
    NULL,
    NULL,
    NULL,
    NULL
};
14177
14178PyMODINIT_FUNC
14179PyInit__string(void)
14180{
14181 return PyModule_Create(&_string_module);
14182}
14183
14184
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014185#ifdef __cplusplus
14186}
14187#endif