blob: 5758ffacf31aaedd9e7b766f0b3100383a0b96b8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection: big endian only when the build configuration
   defines WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros of this file: debug builds run
   _PyUnicode_CheckConsistency() without the content scan, other builds only
   run PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors.  The underscore-prefixed macros without an assert are
   plain lvalue accessors that perform no checking at all; the other ones
   assert _PyUnicode_CHECK() (and readiness where stated) first. */

/* utf8 field of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 data of a ready string: for compact ASCII objects this is the
   memory directly behind the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length field of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the character length for compact ASCII
   objects, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr pointer and its length (unchecked). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash fields of the PyASCIIObject header (unchecked). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, after asserting _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement of PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The object is asserted with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True when the utf8 pointer of a (non compact ASCII) object is the very
   same pointer as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the wstr pointer of the object is the very same pointer as its
   character data.  The macro only refers to its own argument, so it can be
   used on any expression, not just on a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a UTF-8 memory block of its own: it is
   not a compact ASCII object, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own: its wstr
   pointer is set and either the string is not ready or the pointer is not
   the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once.  "to" is parenthesized before
   it is cast, so that an expression argument such as "buf + offset" is
   converted as a whole instead of having the cast applied to "buf" only.
   All helper locals carry an underscore prefix to keep them apart from the
   caller's names.  The main loop copies four characters per iteration, the
   trailing loop handles the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: entry i is 1
   when code point i (0x00..0x7F) is one of the characters listed below and
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: CHARACTER TABULATION
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Same for linebreaks: entry i is 1 when code point i (0x00..0x7F) is one
   of the line break characters listed below and 0 otherwise.  The table is
   a pure lookup table that is only read (see BLOOM_LINEBREAK), hence it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-only structural self check of a unicode object.

   op must be a unicode object.  Every invariant is verified with assert(),
   so a violation aborts the process; when all checks pass the function
   returns 1 (which lets callers wrap the call itself in an assert()).

   check_content: when non-zero and the string is not in the wchar_t-only
   state, all characters are scanned additionally to verify that the
   narrowest kind able to hold the largest character is in use. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject header exists, so none of the
       PyCompactUnicodeObject fields are inspected */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the
               PyCompactUnicodeObject structure */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact: data lives in a separate block */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* an ASCII string shares its data with the utf8 pointer,
                   any other string must not */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data pointer exactly when the kind is the
               one selected by SIZEOF_WCHAR_T below */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an unset cache pointer goes together with a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest character of the string */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100500 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100501}
502
/* Windows version information; only compiled when HAVE_MBCS is defined. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
509/* stuff to implement simple "bloom filters" for Unicode characters.
510 to keep things simple, we use a single bitmask, using the least 5
511 bits from each unicode characters as the bit index. */
512
513/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525#define BLOOM_MASK unsigned long
526
527static BLOOM_MASK bloom_linebreak;
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
530#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532#define BLOOM_LINEBREAK(ch) \
533 ((ch) < 128U ? ascii_linebreak[(ch)] : \
534 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
/* Membership test of chr in str: the bloom mask is checked first, and
   PyUnicode_FindChar() over the whole string only runs on a hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
658 PyErr_NoMemory();
659 return NULL;
660 }
661 new_size = (struct_size + (length + 1) * char_size);
662
Victor Stinner84def372011-12-11 20:04:56 +0100663 _Py_DEC_REFTOTAL;
664 _Py_ForgetReference(unicode);
665
666 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
667 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100668 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyErr_NoMemory();
670 return NULL;
671 }
Victor Stinner84def372011-12-11 20:04:56 +0100672 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100674
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200676 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100678 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 _PyUnicode_WSTR_LENGTH(unicode) = length;
680 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
682 length, 0);
683 return unicode;
684}
685
Alexander Belopolsky40018472011-02-26 01:02:56 +0000686static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688{
Victor Stinner95663112011-10-04 01:03:50 +0200689 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100690 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 if (PyUnicode_IS_READY(unicode)) {
695 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200696 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 void *data;
698
699 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200700 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200701 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
702 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703
704 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
705 PyErr_NoMemory();
706 return -1;
707 }
708 new_size = (length + 1) * char_size;
709
Victor Stinner7a9105a2011-12-12 00:13:42 +0100710 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
711 {
712 PyObject_DEL(_PyUnicode_UTF8(unicode));
713 _PyUnicode_UTF8(unicode) = NULL;
714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
715 }
716
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 data = (PyObject *)PyObject_REALLOC(data, new_size);
718 if (data == NULL) {
719 PyErr_NoMemory();
720 return -1;
721 }
722 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200723 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 _PyUnicode_WSTR_LENGTH(unicode) = length;
726 }
727 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 _PyUnicode_UTF8_LENGTH(unicode) = length;
730 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 _PyUnicode_LENGTH(unicode) = length;
732 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200733 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200734 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinner95663112011-10-04 01:03:50 +0200738 assert(_PyUnicode_WSTR(unicode) != NULL);
739
740 /* check for integer overflow */
741 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
742 PyErr_NoMemory();
743 return -1;
744 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200746 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100747 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200748 if (!wstr) {
749 PyErr_NoMemory();
750 return -1;
751 }
752 _PyUnicode_WSTR(unicode) = wstr;
753 _PyUnicode_WSTR(unicode)[length] = 0;
754 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200755 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 return 0;
757}
758
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759static PyObject*
760resize_copy(PyObject *unicode, Py_ssize_t length)
761{
762 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100763 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100765
766 if (PyUnicode_READY(unicode) < 0)
767 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
770 if (copy == NULL)
771 return NULL;
772
773 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200774 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200776 }
777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200778 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 if (w == NULL)
782 return NULL;
783 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
784 copy_length = Py_MIN(copy_length, length);
785 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
786 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200787 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 }
789}
790
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000792 Ux0000 terminated; some code (e.g. new_identifier)
793 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794
795 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000796 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798*/
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200801static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802#endif
803
Alexander Belopolsky40018472011-02-26 01:02:56 +0000804static PyUnicodeObject *
805_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806{
807 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Thomas Wouters477c8d52006-05-27 19:21:47 +0000810 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (length == 0 && unicode_empty != NULL) {
812 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200813 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000816 /* Ensure we won't overflow the size. */
817 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
818 return (PyUnicodeObject *)PyErr_NoMemory();
819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 if (length < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to _PyUnicode_New");
823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826#ifdef Py_DEBUG
827 ++unicode_old_new_calls;
828#endif
829
830 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
831 if (unicode == NULL)
832 return NULL;
833 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
834 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
835 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100838 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
863}
864
Victor Stinnerf42dc442011-10-02 23:33:16 +0200865static const char*
866unicode_kind_name(PyObject *unicode)
867{
Victor Stinner42dfd712011-10-03 14:41:45 +0200868 /* don't check consistency: unicode_kind_name() is called from
869 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870 if (!PyUnicode_IS_COMPACT(unicode))
871 {
872 if (!PyUnicode_IS_READY(unicode))
873 return "wstr";
874 switch(PyUnicode_KIND(unicode))
875 {
876 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200878 return "legacy ascii";
879 else
880 return "legacy latin1";
881 case PyUnicode_2BYTE_KIND:
882 return "legacy UCS2";
883 case PyUnicode_4BYTE_KIND:
884 return "legacy UCS4";
885 default:
886 return "<legacy invalid kind>";
887 }
888 }
889 assert(PyUnicode_IS_READY(unicode));
890 switch(PyUnicode_KIND(unicode))
891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character string stored in
   unicode_latin1[]), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
1794 switch(kind) {
1795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 switch(kind) {
1896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
1962 if (PyUnicode_READY(unicode))
1963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
1988 if (PyUnicode_READY(s))
1989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
1997 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` characters.

   A size of -1 means that `w` is NUL-terminated and its length is
   computed with wcslen().  A NULL buffer is only accepted together with
   size 0 (the empty string is returned); otherwise a bad-internal-call
   error is raised.  Return a new reference, or NULL on error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
Victor Stinner96865452011-03-01 23:44:09 +00002164/* helper for PyUnicode_FromFormatV() */
2165
2166static const char*
2167parse_format_flags(const char *f,
2168 int *p_width, int *p_precision,
2169 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2170{
2171 int width, precision, longflag, longlongflag, size_tflag;
2172
2173 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2174 f++;
2175 width = 0;
2176 while (Py_ISDIGIT((unsigned)*f))
2177 width = (width*10) + *f++ - '0';
2178 precision = 0;
2179 if (*f == '.') {
2180 f++;
2181 while (Py_ISDIGIT((unsigned)*f))
2182 precision = (precision*10) + *f++ - '0';
2183 if (*f == '%') {
2184 /* "%.3%s" => f points to "3" */
2185 f--;
2186 }
2187 }
2188 if (*f == '\0') {
2189 /* bogus format "%.1" => go backward, f points to "1" */
2190 f--;
2191 }
2192 if (p_width != NULL)
2193 *p_width = width;
2194 if (p_precision != NULL)
2195 *p_precision = precision;
2196
2197 /* Handle %ld, %lu, %lld and %llu. */
2198 longflag = 0;
2199 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002200 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002201
2202 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002203 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 longflag = 1;
2205 ++f;
2206 }
2207#ifdef HAVE_LONG_LONG
2208 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002209 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002210 longlongflag = 1;
2211 f += 2;
2212 }
2213#endif
2214 }
2215 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002216 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002217 size_tflag = 1;
2218 ++f;
2219 }
2220 if (p_longflag != NULL)
2221 *p_longflag = longflag;
2222 if (p_longlongflag != NULL)
2223 *p_longlongflag = longlongflag;
2224 if (p_size_tflag != NULL)
2225 *p_size_tflag = size_tflag;
2226 return f;
2227}
2228
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal plus an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus one
   character for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002474 if (PyUnicode_READY(str_obj)) {
2475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the str and switch to the next slot */
2497 *callresult++ = str;
2498 break;
2499 }
2500 case 'R':
2501 {
2502 PyObject *obj = va_arg(count, PyObject *);
2503 PyObject *repr;
2504 assert(obj);
2505 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002509 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* Remember the repr and switch to the next slot */
2512 *callresult++ = repr;
2513 break;
2514 }
2515 case 'A':
2516 {
2517 PyObject *obj = va_arg(count, PyObject *);
2518 PyObject *ascii;
2519 assert(obj);
2520 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002524 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 /* Remember the repr and switch to the next slot */
2527 *callresult++ = ascii;
2528 break;
2529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 default:
2531 /* if we stumble upon an unknown
2532 formatting code, copy the rest of
2533 the format string to the output
2534 string. (we cannot just skip the
2535 code, since there's no way to know
2536 what's in the argument list) */
2537 n += strlen(p);
2538 goto expand;
2539 }
2540 } else
2541 n++;
2542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 we don't have to resize the string.
2547 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002548 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 if (!string)
2550 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 kind = PyUnicode_KIND(string);
2552 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002558 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002559
2560 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2562 /* checking for == because the last argument could be a empty
2563 string, which causes i to point to end, the assert at the end of
2564 the loop */
2565 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 switch (*f) {
2568 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002569 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 const int ordinal = va_arg(vargs, int);
2571 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002573 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002574 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 case 'p':
2579 /* unused, since we already have the result */
2580 if (*f == 'p')
2581 (void) va_arg(vargs, void *);
2582 else
2583 (void) va_arg(vargs, int);
2584 /* extract the result from numberresults and append. */
2585 for (; *numberresult; ++i, ++numberresult)
2586 PyUnicode_WRITE(kind, data, i, *numberresult);
2587 /* skip over the separating '\0' */
2588 assert(*numberresult == '\0');
2589 numberresult++;
2590 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 case 's':
2593 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002594 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 size = PyUnicode_GET_LENGTH(*callresult);
2598 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002599 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 /* We're done with the unicode()/repr() => forget it */
2602 Py_DECREF(*callresult);
2603 /* switch to next unicode()/repr() result */
2604 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'U':
2608 {
2609 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 Py_ssize_t size;
2611 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2612 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 break;
2616 }
2617 case 'V':
2618 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 size = PyUnicode_GET_LENGTH(obj);
2624 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(*callresult);
2629 assert(PyUnicode_KIND(*callresult) <=
2630 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002631 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 break;
2637 }
2638 case 'S':
2639 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002640 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 /* unused, since we already have the result */
2644 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002646 copy_characters(string, i, *callresult, 0, size);
2647 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* We're done with the unicode()/repr() => forget it */
2649 Py_DECREF(*callresult);
2650 /* switch to next unicode()/repr() result */
2651 ++callresult;
2652 break;
2653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 break;
2657 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 for (; *p; ++p, ++i)
2659 PyUnicode_WRITE(kind, data, i, *p);
2660 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 goto end;
2662 }
Victor Stinner1205f272010-09-11 00:54:47 +00002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 else {
2665 assert(i < PyUnicode_GET_LENGTH(string));
2666 PyUnicode_WRITE(kind, data, i++, *f);
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 if (callresults)
2673 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 if (numberresults)
2675 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002676 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 if (callresults) {
2679 PyObject **callresult2 = callresults;
2680 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002681 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 ++callresult2;
2683 }
2684 PyObject_Free(callresults);
2685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689}
2690
Walter Dörwaldd2034312007-05-18 16:29:38 +00002691PyObject *
2692PyUnicode_FromFormat(const char *format, ...)
2693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 PyObject* ret;
2695 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696
2697#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 ret = PyUnicode_FromFormatV(format, vargs);
2703 va_end(vargs);
2704 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707#ifdef HAVE_WCHAR_H
2708
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2710 convert a Unicode object to a wide character string.
2711
Victor Stinnerd88d9832011-09-06 02:00:05 +02002712 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 character) required to convert the unicode object. Ignore size argument.
2714
Victor Stinnerd88d9832011-09-06 02:00:05 +02002715 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
2722{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 const wchar_t *wstr;
2725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002726 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 if (wstr == NULL)
2728 return -1;
2729
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size > res)
2732 size = res + 1;
2733 else
2734 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 return res;
2737 }
2738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002740}
2741
2742Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002743PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002744 wchar_t *w,
2745 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
2747 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyErr_BadInternalCall();
2749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752}
2753
Victor Stinner137c34c2010-09-29 10:25:54 +00002754wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002755PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 Py_ssize_t *size)
2757{
2758 wchar_t* buffer;
2759 Py_ssize_t buflen;
2760
2761 if (unicode == NULL) {
2762 PyErr_BadInternalCall();
2763 return NULL;
2764 }
2765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002766 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (buflen == -1)
2768 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 PyErr_NoMemory();
2771 return NULL;
2772 }
2773
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2775 if (buffer == NULL) {
2776 PyErr_NoMemory();
2777 return NULL;
2778 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002779 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (buflen == -1)
2781 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size != NULL)
2783 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 return buffer;
2785}
2786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Alexander Belopolsky40018472011-02-26 01:02:56 +00002789PyObject *
2790PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002793 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_ValueError,
2795 "chr() arg not in range(0x110000)");
2796 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 if (ordinal < 256)
2800 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 v = PyUnicode_New(1, ordinal);
2803 if (v == NULL)
2804 return NULL;
2805 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002806 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002815 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002816 if (PyUnicode_READY(obj))
2817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 Py_INCREF(obj);
2819 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 }
2821 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 /* For a Unicode subtype that's not a Unicode object,
2823 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002824 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002826 PyErr_Format(PyExc_TypeError,
2827 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002828 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830}
2831
Alexander Belopolsky40018472011-02-26 01:02:56 +00002832PyObject *
2833PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002834 const char *encoding,
2835 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002837 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 PyErr_BadInternalCall();
2842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 /* Decoding bytes objects is the most common case and should be fast */
2846 if (PyBytes_Check(obj)) {
2847 if (PyBytes_GET_SIZE(obj) == 0) {
2848 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002849 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 }
2851 else {
2852 v = PyUnicode_Decode(
2853 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2854 encoding, errors);
2855 }
2856 return v;
2857 }
2858
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyErr_SetString(PyExc_TypeError,
2861 "decoding str is not supported");
2862 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002865 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2866 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2867 PyErr_Format(PyExc_TypeError,
2868 "coercing to str: need bytes, bytearray "
2869 "or buffer-like object, %.80s found",
2870 Py_TYPE(obj)->tp_name);
2871 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002872 }
Tim Petersced69f82003-09-16 20:30:58 +00002873
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002876 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
/* Normalize an encoding name for the shortcut comparisons: ASCII upper-case
   letters are converted to lower case and '_' is replaced with '-' in order
   to catch e.g. UTF_8.  A NULL encoding is treated as "utf-8".

   The NUL-terminated result is written to lower, a buffer of lower_len
   bytes.  Return 1 on success, 0 if the normalized name plus its terminator
   does not fit into lower_len bytes (the contents of lower are then
   unspecified). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Run the default name through the bounded loop below instead of
       copying it blindly, so that it cannot overflow a short buffer. */
    if (encoding == NULL)
        encoding = "utf-8";

    /* With no room for the terminator, &lower[lower_len - 1] would wrap
       around and the bounds check in the loop would never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];  /* slot reserved for the '\0' */
    while (*e) {
        char c;

        if (l == l_end)
            return 0;
        c = *e++;
        /* ASCII-only, locale-independent folding: bytes outside 'A'..'Z'
           are copied unchanged apart from the '_' -> '-' replacement. */
        if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 Py_ssize_t size,
2925 const char *encoding,
2926 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002927{
2928 PyObject *buffer = NULL, *unicode;
2929 Py_buffer info;
2930 char lower[11]; /* Enough for any encoding shortcut */
2931
Fred Drakee4315f52000-05-09 19:53:39 +00002932 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002933 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002934 if ((strcmp(lower, "utf-8") == 0) ||
2935 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002936 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002937 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002938 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002939 (strcmp(lower, "iso-8859-1") == 0))
2940 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002941#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002942 else if (strcmp(lower, "mbcs") == 0)
2943 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002944#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002945 else if (strcmp(lower, "ascii") == 0)
2946 return PyUnicode_DecodeASCII(s, size, errors);
2947 else if (strcmp(lower, "utf-16") == 0)
2948 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2949 else if (strcmp(lower, "utf-32") == 0)
2950 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952
2953 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002954 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002955 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002956 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002957 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 if (buffer == NULL)
2959 goto onError;
2960 unicode = PyCodec_Decode(buffer, encoding, errors);
2961 if (unicode == NULL)
2962 goto onError;
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002965 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002966 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 Py_DECREF(unicode);
2968 goto onError;
2969 }
2970 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002971 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002972
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 Py_XDECREF(buffer);
2975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 const char *encoding,
2981 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002982{
2983 PyObject *v;
2984
2985 if (!PyUnicode_Check(unicode)) {
2986 PyErr_BadArgument();
2987 goto onError;
2988 }
2989
2990 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992
2993 /* Decode via the codec registry */
2994 v = PyCodec_Decode(unicode, encoding, errors);
2995 if (v == NULL)
2996 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002997 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000 return NULL;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017
3018 /* Decode via the codec registry */
3019 v = PyCodec_Decode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 if (!PyUnicode_Check(v)) {
3023 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003024 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025 Py_TYPE(v)->tp_name);
3026 Py_DECREF(v);
3027 goto onError;
3028 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003029 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003032 return NULL;
3033}
3034
Alexander Belopolsky40018472011-02-26 01:02:56 +00003035PyObject *
3036PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003037 Py_ssize_t size,
3038 const char *encoding,
3039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
3041 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003042
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 unicode = PyUnicode_FromUnicode(s, size);
3044 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3047 Py_DECREF(unicode);
3048 return v;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003055{
3056 PyObject *v;
3057
3058 if (!PyUnicode_Check(unicode)) {
3059 PyErr_BadArgument();
3060 goto onError;
3061 }
3062
3063 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065
3066 /* Encode via the codec registry */
3067 v = PyCodec_Encode(unicode, encoding, errors);
3068 if (v == NULL)
3069 goto onError;
3070 return v;
3071
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073 return NULL;
3074}
3075
Victor Stinnerad158722010-10-27 00:25:46 +00003076PyObject *
3077PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003078{
Victor Stinner99b95382011-07-04 14:23:54 +02003079#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003080 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003081#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003083#else
Victor Stinner793b5312011-04-27 00:24:21 +02003084 PyInterpreterState *interp = PyThreadState_GET()->interp;
3085 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3086 cannot use it to encode and decode filenames before it is loaded. Load
3087 the Python codec requires to encode at least its own filename. Use the C
3088 version of the locale codec until the codec registry is initialized and
3089 the Python codec is loaded.
3090
3091 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3092 cannot only rely on it: check also interp->fscodec_initialized for
3093 subinterpreters. */
3094 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003095 return PyUnicode_AsEncodedString(unicode,
3096 Py_FileSystemDefaultEncoding,
3097 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003098 }
3099 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003100 /* locale encoding with surrogateescape */
3101 wchar_t *wchar;
3102 char *bytes;
3103 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003104 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105
3106 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3107 if (wchar == NULL)
3108 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003109 bytes = _Py_wchar2char(wchar, &error_pos);
3110 if (bytes == NULL) {
3111 if (error_pos != (size_t)-1) {
3112 char *errmsg = strerror(errno);
3113 PyObject *exc = NULL;
3114 if (errmsg == NULL)
3115 errmsg = "Py_wchar2char() failed";
3116 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003117 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003118 error_pos, error_pos+1,
3119 errmsg);
3120 Py_XDECREF(exc);
3121 }
3122 else
3123 PyErr_NoMemory();
3124 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003125 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003126 }
3127 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003128
3129 bytes_obj = PyBytes_FromString(bytes);
3130 PyMem_Free(bytes);
3131 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003132 }
Victor Stinnerad158722010-10-27 00:25:46 +00003133#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140{
3141 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003142 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003143
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 if (!PyUnicode_Check(unicode)) {
3145 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003146 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 }
Fred Drakee4315f52000-05-09 19:53:39 +00003148
Fred Drakee4315f52000-05-09 19:53:39 +00003149 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003150 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003151 if ((strcmp(lower, "utf-8") == 0) ||
3152 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003153 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003154 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003155 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003156 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003158 }
Victor Stinner37296e82010-06-10 13:36:23 +00003159 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003160 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003161 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003163#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003164 else if (strcmp(lower, "mbcs") == 0)
3165 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003166#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003167 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
3171 /* Encode via the codec registry */
3172 v = PyCodec_Encode(unicode, encoding, errors);
3173 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003174 return NULL;
3175
3176 /* The normal path */
3177 if (PyBytes_Check(v))
3178 return v;
3179
3180 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003182 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003183 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003184
3185 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3186 "encoder %s returned bytearray instead of bytes",
3187 encoding);
3188 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003189 Py_DECREF(v);
3190 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003191 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003193 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3194 Py_DECREF(v);
3195 return b;
3196 }
3197
3198 PyErr_Format(PyExc_TypeError,
3199 "encoder did not return a bytes object (type=%.400s)",
3200 Py_TYPE(v)->tp_name);
3201 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202 return NULL;
3203}
3204
Alexander Belopolsky40018472011-02-26 01:02:56 +00003205PyObject *
3206PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003207 const char *encoding,
3208 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003209{
3210 PyObject *v;
3211
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
3214 goto onError;
3215 }
3216
3217 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003219
3220 /* Encode via the codec registry */
3221 v = PyCodec_Encode(unicode, encoding, errors);
3222 if (v == NULL)
3223 goto onError;
3224 if (!PyUnicode_Check(v)) {
3225 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003226 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003227 Py_TYPE(v)->tp_name);
3228 Py_DECREF(v);
3229 goto onError;
3230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return NULL;
3235}
3236
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003237PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003238PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003239 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003240 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3241}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242
Christian Heimes5894ba72007-11-04 11:43:14 +00003243PyObject*
3244PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3245{
Victor Stinner99b95382011-07-04 14:23:54 +02003246#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003247 return PyUnicode_DecodeMBCS(s, size, NULL);
3248#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003249 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003250#else
Victor Stinner793b5312011-04-27 00:24:21 +02003251 PyInterpreterState *interp = PyThreadState_GET()->interp;
3252 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3253 cannot use it to encode and decode filenames before it is loaded. Load
3254 the Python codec requires to encode at least its own filename. Use the C
3255 version of the locale codec until the codec registry is initialized and
3256 the Python codec is loaded.
3257
3258 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3259 cannot only rely on it: check also interp->fscodec_initialized for
3260 subinterpreters. */
3261 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003262 return PyUnicode_Decode(s, size,
3263 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003264 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003265 }
3266 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003267 /* locale encoding with surrogateescape */
3268 wchar_t *wchar;
3269 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003270 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003271
3272 if (s[size] != '\0' || size != strlen(s)) {
3273 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3274 return NULL;
3275 }
3276
Victor Stinner168e1172010-10-16 23:16:16 +00003277 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003278 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003279 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003280
Victor Stinner168e1172010-10-16 23:16:16 +00003281 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003282 PyMem_Free(wchar);
3283 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003284 }
Victor Stinnerad158722010-10-27 00:25:46 +00003285#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003286}
3287
Martin v. Löwis011e8422009-05-05 04:43:17 +00003288
3289int
3290PyUnicode_FSConverter(PyObject* arg, void* addr)
3291{
3292 PyObject *output = NULL;
3293 Py_ssize_t size;
3294 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003295 if (arg == NULL) {
3296 Py_DECREF(*(PyObject**)addr);
3297 return 1;
3298 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003299 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003300 output = arg;
3301 Py_INCREF(output);
3302 }
3303 else {
3304 arg = PyUnicode_FromObject(arg);
3305 if (!arg)
3306 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003307 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003308 Py_DECREF(arg);
3309 if (!output)
3310 return 0;
3311 if (!PyBytes_Check(output)) {
3312 Py_DECREF(output);
3313 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3314 return 0;
3315 }
3316 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003317 size = PyBytes_GET_SIZE(output);
3318 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003319 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003320 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003321 Py_DECREF(output);
3322 return 0;
3323 }
3324 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003325 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003326}
3327
3328
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003329int
3330PyUnicode_FSDecoder(PyObject* arg, void* addr)
3331{
3332 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003333 if (arg == NULL) {
3334 Py_DECREF(*(PyObject**)addr);
3335 return 1;
3336 }
3337 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003338 if (PyUnicode_READY(arg))
3339 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003340 output = arg;
3341 Py_INCREF(output);
3342 }
3343 else {
3344 arg = PyBytes_FromObject(arg);
3345 if (!arg)
3346 return 0;
3347 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3348 PyBytes_GET_SIZE(arg));
3349 Py_DECREF(arg);
3350 if (!output)
3351 return 0;
3352 if (!PyUnicode_Check(output)) {
3353 Py_DECREF(output);
3354 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3355 return 0;
3356 }
3357 }
Victor Stinner065836e2011-10-27 01:56:33 +02003358 if (PyUnicode_READY(output) < 0) {
3359 Py_DECREF(output);
3360 return 0;
3361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003362 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003363 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003364 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3365 Py_DECREF(output);
3366 return 0;
3367 }
3368 *(PyObject**)addr = output;
3369 return Py_CLEANUP_SUPPORTED;
3370}
3371
3372
Martin v. Löwis5b222132007-06-10 09:51:05 +00003373char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003374PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003375{
Christian Heimesf3863112007-11-22 07:46:41 +00003376 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003377
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003378 if (!PyUnicode_Check(unicode)) {
3379 PyErr_BadArgument();
3380 return NULL;
3381 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003382 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003383 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003385 if (PyUnicode_UTF8(unicode) == NULL) {
3386 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3388 if (bytes == NULL)
3389 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003390 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3391 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 Py_DECREF(bytes);
3393 return NULL;
3394 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003395 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3396 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3397 PyBytes_AS_STRING(bytes),
3398 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 Py_DECREF(bytes);
3400 }
3401
3402 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003403 *psize = PyUnicode_UTF8_LENGTH(unicode);
3404 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003405}
3406
3407char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3411}
3412
3413#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003414static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415#endif
3416
3417
3418Py_UNICODE *
3419PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 const unsigned char *one_byte;
3422#if SIZEOF_WCHAR_T == 4
3423 const Py_UCS2 *two_bytes;
3424#else
3425 const Py_UCS4 *four_bytes;
3426 const Py_UCS4 *ucs4_end;
3427 Py_ssize_t num_surrogates;
3428#endif
3429 wchar_t *w;
3430 wchar_t *wchar_end;
3431
3432 if (!PyUnicode_Check(unicode)) {
3433 PyErr_BadArgument();
3434 return NULL;
3435 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003436 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003437 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003438 assert(_PyUnicode_KIND(unicode) != 0);
3439 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440
3441#ifdef Py_DEBUG
3442 ++unicode_as_unicode_calls;
3443#endif
3444
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003445 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003446#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003447 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3448 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003449 num_surrogates = 0;
3450
3451 for (; four_bytes < ucs4_end; ++four_bytes) {
3452 if (*four_bytes > 0xFFFF)
3453 ++num_surrogates;
3454 }
3455
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3457 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3458 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 PyErr_NoMemory();
3460 return NULL;
3461 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003462 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003464 w = _PyUnicode_WSTR(unicode);
3465 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3466 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003467 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3468 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003469 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003471 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3472 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003473 }
3474 else
3475 *w = *four_bytes;
3476
3477 if (w > wchar_end) {
3478 assert(0 && "Miscalculated string end");
3479 }
3480 }
3481 *w = 0;
3482#else
3483 /* sizeof(wchar_t) == 4 */
3484 Py_FatalError("Impossible unicode object state, wstr and str "
3485 "should share memory already.");
3486 return NULL;
3487#endif
3488 }
3489 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003490 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3491 (_PyUnicode_LENGTH(unicode) + 1));
3492 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003493 PyErr_NoMemory();
3494 return NULL;
3495 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003496 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3497 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3498 w = _PyUnicode_WSTR(unicode);
3499 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003501 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3502 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003503 for (; w < wchar_end; ++one_byte, ++w)
3504 *w = *one_byte;
3505 /* null-terminate the wstr */
3506 *w = 0;
3507 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003509#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003510 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003511 for (; w < wchar_end; ++two_bytes, ++w)
3512 *w = *two_bytes;
3513 /* null-terminate the wstr */
3514 *w = 0;
3515#else
3516 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003517 PyObject_FREE(_PyUnicode_WSTR(unicode));
3518 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 Py_FatalError("Impossible unicode object state, wstr "
3520 "and str should share memory already.");
3521 return NULL;
3522#endif
3523 }
3524 else {
3525 assert(0 && "This should never happen.");
3526 }
3527 }
3528 }
3529 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003530 *size = PyUnicode_WSTR_LENGTH(unicode);
3531 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003532}
3533
Alexander Belopolsky40018472011-02-26 01:02:56 +00003534Py_UNICODE *
3535PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538}
3539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003540
Alexander Belopolsky40018472011-02-26 01:02:56 +00003541Py_ssize_t
3542PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543{
3544 if (!PyUnicode_Check(unicode)) {
3545 PyErr_BadArgument();
3546 goto onError;
3547 }
3548 return PyUnicode_GET_SIZE(unicode);
3549
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return -1;
3552}
3553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003554Py_ssize_t
3555PyUnicode_GetLength(PyObject *unicode)
3556{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003557 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003558 PyErr_BadArgument();
3559 return -1;
3560 }
3561
3562 return PyUnicode_GET_LENGTH(unicode);
3563}
3564
3565Py_UCS4
3566PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3567{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003568 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3569 PyErr_BadArgument();
3570 return (Py_UCS4)-1;
3571 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003572 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003573 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003574 return (Py_UCS4)-1;
3575 }
3576 return PyUnicode_READ_CHAR(unicode, index);
3577}
3578
3579int
3580PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3581{
3582 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003583 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003584 return -1;
3585 }
Victor Stinner488fa492011-12-12 00:01:39 +01003586 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003587 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003588 PyErr_SetString(PyExc_IndexError, "string index out of range");
3589 return -1;
3590 }
Victor Stinner488fa492011-12-12 00:01:39 +01003591 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003592 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003593 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3594 index, ch);
3595 return 0;
3596}
3597
/* Return the name of the default encoding: always the static string
   "utf-8".  The caller must not modify or free the result. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3603
Victor Stinner554f3f02010-06-16 23:33:54 +00003604/* create or adjust a UnicodeDecodeError */
3605static void
3606make_decode_exception(PyObject **exceptionObject,
3607 const char *encoding,
3608 const char *input, Py_ssize_t length,
3609 Py_ssize_t startpos, Py_ssize_t endpos,
3610 const char *reason)
3611{
3612 if (*exceptionObject == NULL) {
3613 *exceptionObject = PyUnicodeDecodeError_Create(
3614 encoding, input, length, startpos, endpos, reason);
3615 }
3616 else {
3617 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3618 goto onError;
3619 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3620 goto onError;
3621 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3622 goto onError;
3623 }
3624 return;
3625
3626onError:
3627 Py_DECREF(*exceptionObject);
3628 *exceptionObject = NULL;
3629}
3630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631/* error handling callback helper:
3632 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003633 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 and adjust various state variables.
3635 return 0 on success, -1 on error
3636*/
3637
Alexander Belopolsky40018472011-02-26 01:02:56 +00003638static int
3639unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003640 const char *encoding, const char *reason,
3641 const char **input, const char **inend, Py_ssize_t *startinpos,
3642 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003643 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003645 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646
3647 PyObject *restuple = NULL;
3648 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003649 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003650 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003651 Py_ssize_t requiredsize;
3652 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003653 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 int res = -1;
3655
Victor Stinner596a6c42011-11-09 00:02:18 +01003656 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3657 outsize = PyUnicode_GET_LENGTH(*output);
3658 else
3659 outsize = _PyUnicode_WSTR_LENGTH(*output);
3660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 *errorHandler = PyCodec_LookupError(errors);
3663 if (*errorHandler == NULL)
3664 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 }
3666
Victor Stinner554f3f02010-06-16 23:33:54 +00003667 make_decode_exception(exceptionObject,
3668 encoding,
3669 *input, *inend - *input,
3670 *startinpos, *endinpos,
3671 reason);
3672 if (*exceptionObject == NULL)
3673 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674
3675 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3676 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003679 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 }
3682 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003684 if (PyUnicode_READY(repunicode) < 0)
3685 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003686
3687 /* Copy back the bytes variables, which might have been modified by the
3688 callback */
3689 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3690 if (!inputobj)
3691 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003692 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003694 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003695 *input = PyBytes_AS_STRING(inputobj);
3696 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003697 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003698 /* we can DECREF safely, as the exception has another reference,
3699 so the object won't go away. */
3700 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003704 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3706 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708
Victor Stinner596a6c42011-11-09 00:02:18 +01003709 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3710 /* need more space? (at least enough for what we
3711 have+the replacement+the rest of the string (starting
3712 at the new input position), so we won't have to check space
3713 when there are no errors in the rest of the string) */
3714 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3715 requiredsize = *outpos + replen + insize-newpos;
3716 if (requiredsize > outsize) {
3717 if (requiredsize<2*outsize)
3718 requiredsize = 2*outsize;
3719 if (unicode_resize(output, requiredsize) < 0)
3720 goto onError;
3721 }
3722 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003724 copy_characters(*output, *outpos, repunicode, 0, replen);
3725 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003727 else {
3728 wchar_t *repwstr;
3729 Py_ssize_t repwlen;
3730 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3731 if (repwstr == NULL)
3732 goto onError;
3733 /* need more space? (at least enough for what we
3734 have+the replacement+the rest of the string (starting
3735 at the new input position), so we won't have to check space
3736 when there are no errors in the rest of the string) */
3737 requiredsize = *outpos + repwlen + insize-newpos;
3738 if (requiredsize > outsize) {
3739 if (requiredsize < 2*outsize)
3740 requiredsize = 2*outsize;
3741 if (unicode_resize(output, requiredsize) < 0)
3742 goto onError;
3743 }
3744 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3745 *outpos += repwlen;
3746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003748 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 /* we made it! */
3751 res = 0;
3752
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_XDECREF(restuple);
3755 return res;
3756}
3757
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the modified base-64 alphabet used by UTF-7.
   All of them evaluate their argument more than once or index with it, so
   only pass side-effect-free expressions. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c; c must satisfy IS_BASE64(). */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
     "abcdefghijklmnopqrstuvwxyz"      \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if the byte c, met outside a shift sequence, decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
3792
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only 1..127 can ever be direct (the range
 *            test also guards the table lookup).
 * directO  - non-zero if Set O characters may be written directly.
 * directWS - non-zero if whitespace may be written directly.
 *
 * Every argument is parenthesized in the expansion so that expressions
 * such as `a || b` are not re-associated with the surrounding `&&`.
 * c is evaluated several times: pass side-effect-free expressions only. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003838
Alexander Belopolsky40018472011-02-26 01:02:56 +00003839PyObject *
3840PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003841 Py_ssize_t size,
3842 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003843{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003844 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3845}
3846
Antoine Pitrou244651a2009-05-04 18:56:13 +00003847/* The decoder. The only state we preserve is our read position,
3848 * i.e. how many characters we have consumed. So if we end in the
3849 * middle of a shift sequence we have to back off the read position
3850 * and the output to the beginning of the sequence, otherwise we lose
3851 * all the shift state (seen bits, number of bits seen, high
3852 * surrogate). */
3853
Alexander Belopolsky40018472011-02-26 01:02:56 +00003854PyObject *
3855PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003856 Py_ssize_t size,
3857 const char *errors,
3858 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003861 Py_ssize_t startinpos;
3862 Py_ssize_t endinpos;
3863 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003865 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003866 const char *errmsg = "";
3867 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003868 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869 unsigned int base64bits = 0;
3870 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003871 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 PyObject *errorHandler = NULL;
3873 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003875 /* Start off assuming it's all ASCII. Widen later as necessary. */
3876 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 if (!unicode)
3878 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003879 if (size == 0) {
3880 if (consumed)
3881 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003882 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003883 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003885 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003886 e = s + size;
3887
3888 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003889 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003890 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003891 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 if (inShift) { /* in a base-64 section */
3894 if (IS_BASE64(ch)) { /* consume a base-64 character */
3895 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3896 base64bits += 6;
3897 s++;
3898 if (base64bits >= 16) {
3899 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003900 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 base64bits -= 16;
3902 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3903 if (surrogate) {
3904 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003905 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3906 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003907 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3908 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003909 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003910 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 }
3912 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003913 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3914 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 }
3917 }
Victor Stinner551ac952011-11-29 22:58:13 +01003918 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003919 /* first surrogate */
3920 surrogate = outCh;
3921 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003923 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3924 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003925 }
3926 }
3927 }
3928 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003929 inShift = 0;
3930 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003932 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3933 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003934 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003935 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 if (base64bits > 0) { /* left-over bits */
3937 if (base64bits >= 6) {
3938 /* We've seen at least one base-64 character */
3939 errmsg = "partial character in shift sequence";
3940 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 else {
3943 /* Some bits remain; they should be zero */
3944 if (base64buffer != 0) {
3945 errmsg = "non-zero padding bits in shift sequence";
3946 goto utf7Error;
3947 }
3948 }
3949 }
3950 if (ch != '-') {
3951 /* '-' is absorbed; other terminating
3952 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003953 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3954 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003956 }
3957 }
3958 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960 s++; /* consume '+' */
3961 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003963 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3964 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003965 }
3966 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003969 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003970 }
3971 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003972 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3974 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 s++;
3976 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 else {
3978 startinpos = s-starts;
3979 s++;
3980 errmsg = "unexpected special character";
3981 goto utf7Error;
3982 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 endinpos = s-starts;
3986 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003987 errors, &errorHandler,
3988 "utf7", errmsg,
3989 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003990 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 }
3993
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994 /* end of string */
3995
3996 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3997 /* if we're in an inconsistent state, that's an error */
3998 if (surrogate ||
3999 (base64bits >= 6) ||
4000 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004001 endinpos = size;
4002 if (unicode_decode_call_errorhandler(
4003 errors, &errorHandler,
4004 "utf7", "unterminated shift sequence",
4005 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004006 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 goto onError;
4008 if (s < e)
4009 goto restart;
4010 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004011 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012
4013 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004014 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004015 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004016 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004017 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018 }
4019 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004020 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004021 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004024 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004025 goto onError;
4026
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 Py_XDECREF(errorHandler);
4028 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004029 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 Py_XDECREF(errorHandler);
4033 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034 Py_DECREF(unicode);
4035 return NULL;
4036}
4037
4038
Alexander Belopolsky40018472011-02-26 01:02:56 +00004039PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004040_PyUnicode_EncodeUTF7(PyObject *str,
4041 int base64SetO,
4042 int base64WhiteSpace,
4043 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004044{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004045 int kind;
4046 void *data;
4047 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004048 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004049 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004050 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004051 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004052 unsigned int base64bits = 0;
4053 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004054 char * out;
4055 char * start;
4056
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004057 if (PyUnicode_READY(str) < 0)
4058 return NULL;
4059 kind = PyUnicode_KIND(str);
4060 data = PyUnicode_DATA(str);
4061 len = PyUnicode_GET_LENGTH(str);
4062
4063 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004066 /* It might be possible to tighten this worst case */
4067 allocated = 8 * len;
4068 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004069 return PyErr_NoMemory();
4070
Antoine Pitrou244651a2009-05-04 18:56:13 +00004071 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004072 if (v == NULL)
4073 return NULL;
4074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004076 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004077 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079 if (inShift) {
4080 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4081 /* shifting out */
4082 if (base64bits) { /* output remaining bits */
4083 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4084 base64buffer = 0;
4085 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004086 }
4087 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004088 /* Characters not in the BASE64 set implicitly unshift the sequence
4089 so no '-' is required, except if the character is itself a '-' */
4090 if (IS_BASE64(ch) || ch == '-') {
4091 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004092 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004093 *out++ = (char) ch;
4094 }
4095 else {
4096 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004097 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004099 else { /* not in a shift sequence */
4100 if (ch == '+') {
4101 *out++ = '+';
4102 *out++ = '-';
4103 }
4104 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4105 *out++ = (char) ch;
4106 }
4107 else {
4108 *out++ = '+';
4109 inShift = 1;
4110 goto encode_char;
4111 }
4112 }
4113 continue;
4114encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004115 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004116 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004117
Antoine Pitrou244651a2009-05-04 18:56:13 +00004118 /* code first surrogate */
4119 base64bits += 16;
4120 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4121 while (base64bits >= 6) {
4122 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4123 base64bits -= 6;
4124 }
4125 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004126 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004127 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004128 base64bits += 16;
4129 base64buffer = (base64buffer << 16) | ch;
4130 while (base64bits >= 6) {
4131 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4132 base64bits -= 6;
4133 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004134 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004135 if (base64bits)
4136 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4137 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004138 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004139 if (_PyBytes_Resize(&v, out - start) < 0)
4140 return NULL;
4141 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004142}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004143PyObject *
4144PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4145 Py_ssize_t size,
4146 int base64SetO,
4147 int base64WhiteSpace,
4148 const char *errors)
4149{
4150 PyObject *result;
4151 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4152 if (tmp == NULL)
4153 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004154 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004155 base64WhiteSpace, errors);
4156 Py_DECREF(tmp);
4157 return result;
4158}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004159
/* Remove the UTF-7 helper macros defined earlier in this file so that the
   names are no longer defined for the code that follows. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004160#undef IS_BASE64
4161#undef FROM_BASE64
4162#undef TO_BASE64
4163#undef DECODE_DIRECT
4164#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.

   Indexed by the first byte of a sequence (0..255).  The table is only
   ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4189
Alexander Belopolsky40018472011-02-26 01:02:56 +00004190PyObject *
4191PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004192 Py_ssize_t size,
4193 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194{
Walter Dörwald69652032004-09-07 20:24:22 +00004195 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4196}
4197
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004198#include "stringlib/ucs1lib.h"
4199#include "stringlib/codecs.h"
4200#include "stringlib/undef.h"
4201
4202#include "stringlib/ucs2lib.h"
4203#include "stringlib/codecs.h"
4204#include "stringlib/undef.h"
4205
4206#include "stringlib/ucs4lib.h"
4207#include "stringlib/codecs.h"
4208#include "stringlib/undef.h"
4209
Antoine Pitrouab868312009-01-10 15:40:25 +00004210/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4211#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4212
4213/* Mask to quickly check whether a C 'long' contains a
4214 non-ASCII, UTF8-encoded char. */
4215#if (SIZEOF_LONG == 8)
4216# define ASCII_CHAR_MASK 0x8080808080808080L
4217#elif (SIZEOF_LONG == 4)
4218# define ASCII_CHAR_MASK 0x80808080L
4219#else
4220# error C 'long' size should be either 4 or 8!
4221#endif
4222
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004223/* Scans a UTF-8 string and returns the maximum character to be expected
4224 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004226 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228 */
4229static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004230utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 const unsigned char *end = p + string_size;
4234 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236 assert(unicode_size != NULL);
4237
4238 /* By having a cascade of independent loops which fallback onto each
4239 other, we minimize the amount of work done in the average loop
4240 iteration, and we also maximize the CPU's ability to predict
4241 branches correctly (because a given condition will have always the
4242 same boolean outcome except perhaps in the last iteration of the
4243 corresponding loop).
4244 In the general case this brings us rather close to decoding
4245 performance pre-PEP 393, despite the two-pass decoding.
4246
4247 Note that the pure ASCII loop is not duplicated once a non-ASCII
4248 character has been encountered. It is actually a pessimization (by
4249 a significant factor) to use this loop on text with many non-ASCII
4250 characters, and it is important to avoid bad performance on valid
4251 utf-8 data (invalid utf-8 being a different can of worms).
4252 */
4253
4254 /* ASCII */
4255 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256 /* Only check value if it's not a ASCII char... */
4257 if (*p < 0x80) {
4258 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4259 an explanation. */
4260 if (!((size_t) p & LONG_PTR_MASK)) {
4261 /* Help register allocation */
4262 register const unsigned char *_p = p;
4263 while (_p < aligned_end) {
4264 unsigned long value = *(unsigned long *) _p;
4265 if (value & ASCII_CHAR_MASK)
4266 break;
4267 _p += SIZEOF_LONG;
4268 char_count += SIZEOF_LONG;
4269 }
4270 p = _p;
4271 if (p == end)
4272 break;
4273 }
4274 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004275 if (*p < 0x80)
4276 ++char_count;
4277 else
4278 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004280 *unicode_size = char_count;
4281 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004282
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004283_ucs1loop:
4284 for (; p < end; ++p) {
4285 if (*p < 0xc4)
4286 char_count += ((*p & 0xc0) != 0x80);
4287 else
4288 goto _ucs2loop;
4289 }
4290 *unicode_size = char_count;
4291 return 255;
4292
4293_ucs2loop:
4294 for (; p < end; ++p) {
4295 if (*p < 0xf0)
4296 char_count += ((*p & 0xc0) != 0x80);
4297 else
4298 goto _ucs4loop;
4299 }
4300 *unicode_size = char_count;
4301 return 65535;
4302
4303_ucs4loop:
4304 for (; p < end; ++p) {
4305 char_count += ((*p & 0xc0) != 0x80);
4306 }
4307 *unicode_size = char_count;
4308 return 65537;
4309}
4310
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode (the output object, may be
   replaced) and onError (label jumped to on failure).
   Potential resizing overallocates, so the result needs to shrink at the end.

   `index` and `value` are each evaluated exactly once, so passing `i++` is
   safe.

   NOTE(review): the growth check uses '>' rather than '>=', so a write at
   index == current length does not trigger a resize; presumably this relies
   on the extra slot reserved for the terminator -- confirm against
   unicode_resize()/unicode_putchar().
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t wmf_pos_ = (index);                                  \
        if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&                 \
            unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)        \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)          \
            goto onError;                                               \
    } while (0)
4324
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004325static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004326decode_utf8_errors(const char *starts,
4327 Py_ssize_t size,
4328 const char *errors,
4329 Py_ssize_t *consumed,
4330 const char *s,
4331 PyObject *unicode,
4332 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004333{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004335 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004336 Py_ssize_t startinpos;
4337 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004338 const char *e = starts + size;
4339 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004340 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 PyObject *errorHandler = NULL;
4342 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004343
Antoine Pitrouab868312009-01-10 15:40:25 +00004344 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345
4346 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004347 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348
4349 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004350 /* Fast path for runs of ASCII characters. Given that common UTF-8
4351 input will consist of an overwhelming majority of ASCII
4352 characters, we try to optimize for this case by checking
4353 as many characters as a C 'long' can contain.
4354 First, check if we can do an aligned read, as most CPUs have
4355 a penalty for unaligned reads.
4356 */
4357 if (!((size_t) s & LONG_PTR_MASK)) {
4358 /* Help register allocation */
4359 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004361 while (_s < aligned_end) {
4362 /* Read a whole long at a time (either 4 or 8 bytes),
4363 and do a fast unrolled copy if it only contains ASCII
4364 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004365 unsigned long value = *(unsigned long *) _s;
4366 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004367 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4369 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4370 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4371 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004372#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4374 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4375 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4376 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004377#endif
4378 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004379 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004380 }
4381 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004383 if (s == e)
4384 break;
4385 ch = (unsigned char)*s;
4386 }
4387 }
4388
4389 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004390 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 s++;
4392 continue;
4393 }
4394
4395 n = utf8_code_length[ch];
4396
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004397 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 if (consumed)
4399 break;
4400 else {
4401 errmsg = "unexpected end of data";
4402 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004403 endinpos = startinpos+1;
4404 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4405 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 goto utf8Error;
4407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410 switch (n) {
4411
4412 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004413 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 startinpos = s-starts;
4415 endinpos = startinpos+1;
4416 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417
4418 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004419 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 startinpos = s-starts;
4421 endinpos = startinpos+1;
4422 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423
4424 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004425 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004426 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004428 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto utf8Error;
4430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004432 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004433 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 break;
4435
4436 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004437 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4438 will result in surrogates in range d800-dfff. Surrogates are
4439 not valid UTF-8 so they are rejected.
4440 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4441 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004442 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 (s[2] & 0xc0) != 0x80 ||
4444 ((unsigned char)s[0] == 0xE0 &&
4445 (unsigned char)s[1] < 0xA0) ||
4446 ((unsigned char)s[0] == 0xED &&
4447 (unsigned char)s[1] > 0x9F)) {
4448 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004450 endinpos = startinpos + 1;
4451
4452 /* if s[1] first two bits are 1 and 0, then the invalid
4453 continuation byte is s[2], so increment endinpos by 1,
4454 if not, s[1] is invalid and endinpos doesn't need to
4455 be incremented. */
4456 if ((s[1] & 0xC0) == 0x80)
4457 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 goto utf8Error;
4459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004461 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004462 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004463 break;
4464
4465 case 4:
4466 if ((s[1] & 0xc0) != 0x80 ||
4467 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004468 (s[3] & 0xc0) != 0x80 ||
4469 ((unsigned char)s[0] == 0xF0 &&
4470 (unsigned char)s[1] < 0x90) ||
4471 ((unsigned char)s[0] == 0xF4 &&
4472 (unsigned char)s[1] > 0x8F)) {
4473 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004475 endinpos = startinpos + 1;
4476 if ((s[1] & 0xC0) == 0x80) {
4477 endinpos++;
4478 if ((s[2] & 0xC0) == 0x80)
4479 endinpos++;
4480 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 goto utf8Error;
4482 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004483 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004484 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004485 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004486
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004487 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 }
4490 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004492
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 if (unicode_decode_call_errorhandler(
4495 errors, &errorHandler,
4496 "utf8", errmsg,
4497 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004498 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 /* Update data because unicode_decode_call_errorhandler might have
4501 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 }
Walter Dörwald69652032004-09-07 20:24:22 +00004504 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004507 /* Adjust length and ready string when it contained errors and
4508 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004509 if (unicode_resize(&unicode, i) < 0)
4510 goto onError;
4511 unicode_adjust_maxchar(&unicode);
4512 if (unicode == NULL)
4513 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 Py_XDECREF(errorHandler);
4516 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004517 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004518 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 Py_XDECREF(errorHandler);
4522 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004523 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 return NULL;
4525}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004526#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004527
Victor Stinner785938e2011-12-11 20:09:03 +01004528PyObject *
4529PyUnicode_DecodeUTF8Stateful(const char *s,
4530 Py_ssize_t size,
4531 const char *errors,
4532 Py_ssize_t *consumed)
4533{
4534 Py_UCS4 maxchar = 0;
4535 Py_ssize_t unicode_size;
4536 int has_errors = 0;
4537 PyObject *unicode;
4538 int kind;
4539 void *data;
4540 const char *starts = s;
4541 const char *e;
4542 Py_ssize_t i;
4543
4544 if (size == 0) {
4545 if (consumed)
4546 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004547 Py_INCREF(unicode_empty);
4548 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004549 }
4550
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004551 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004552
4553 /* When the string is ASCII only, just use memcpy and return.
4554 unicode_size may be != size if there is an incomplete UTF-8
4555 sequence at the end of the ASCII block. */
4556 if (maxchar < 128 && size == unicode_size) {
4557 if (consumed)
4558 *consumed = size;
4559 return unicode_fromascii(s, size);
4560 }
4561
4562 unicode = PyUnicode_New(unicode_size, maxchar);
4563 if (!unicode)
4564 return NULL;
4565 kind = PyUnicode_KIND(unicode);
4566 data = PyUnicode_DATA(unicode);
4567
4568 /* Unpack UTF-8 encoded data */
4569 i = 0;
4570 e = starts + size;
4571 switch (kind) {
4572 case PyUnicode_1BYTE_KIND:
4573 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4574 break;
4575 case PyUnicode_2BYTE_KIND:
4576 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4577 break;
4578 case PyUnicode_4BYTE_KIND:
4579 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4580 break;
4581 }
4582 if (!has_errors) {
4583 /* Ensure the unicode size calculation was correct */
4584 assert(i == unicode_size);
4585 assert(s == e);
4586 if (consumed)
4587 *consumed = size;
4588 return unicode;
4589 }
4590
4591 /* In case of errors, maxchar and size computation might be incorrect;
4592 code below refits and resizes as necessary. */
4593 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4594}
4595
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a NUL-terminated wchar_t
   buffer obtained from PyMem_Malloc().  Every byte that is not part of a
   valid sequence is emitted as the single code unit 0xDC00 + byte.
   Returns NULL if the buffer cannot be allocated.
   NOTE(review): the caller presumably releases the result with
   PyMem_Free() -- confirm at the call sites. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Reject sizes for which (size + 1) * sizeof(wchar_t) would overflow.
       The comparison is made in unsigned arithmetic without forming
       size + 1, so it cannot overflow itself, and a negative size is
       rejected as well instead of leading to an undersized buffer. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the lead byte. */
        n = utf8_code_length[ch];
        if (s + n > e) {
            /* Sequence truncated by the end of the input. */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Every jump here happens before `ch` is reassigned, so it still
           holds the offending lead byte: escape that one byte and resume
           at the next. */
        *p++ = (wchar_t)(0xDC00 + ch);
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704/* Primary internal function which creates utf8 encoded bytes objects.
4705
4706 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004707 and allocate exactly as much space needed at the end. Else allocate the
4708 maximum possible needed (4 result bytes per Unicode character), and return
4709 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004710*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004711PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004712_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713{
Tim Peters602f7402002-04-27 18:03:26 +00004714#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004715
Guido van Rossum98297ee2007-11-06 21:34:58 +00004716 Py_ssize_t i; /* index into s of next input byte */
4717 PyObject *result; /* result string object */
4718 char *p; /* next free byte in output buffer */
4719 Py_ssize_t nallocated; /* number of result bytes allocated */
4720 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004721 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004722 PyObject *errorHandler = NULL;
4723 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004724 int kind;
4725 void *data;
4726 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004727 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 if (!PyUnicode_Check(unicode)) {
4730 PyErr_BadArgument();
4731 return NULL;
4732 }
4733
4734 if (PyUnicode_READY(unicode) == -1)
4735 return NULL;
4736
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004737 if (PyUnicode_UTF8(unicode))
4738 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4739 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740
4741 kind = PyUnicode_KIND(unicode);
4742 data = PyUnicode_DATA(unicode);
4743 size = PyUnicode_GET_LENGTH(unicode);
4744
Tim Peters602f7402002-04-27 18:03:26 +00004745 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
Tim Peters602f7402002-04-27 18:03:26 +00004747 if (size <= MAX_SHORT_UNICHARS) {
4748 /* Write into the stack buffer; nallocated can't overflow.
4749 * At the end, we'll allocate exactly as much heap space as it
4750 * turns out we need.
4751 */
4752 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004753 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004754 p = stackbuf;
4755 }
4756 else {
4757 /* Overallocate on the heap, and give the excess back at the end. */
4758 nallocated = size * 4;
4759 if (nallocated / 4 != size) /* overflow! */
4760 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004761 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004762 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004763 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004764 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004765 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004766
Tim Peters602f7402002-04-27 18:03:26 +00004767 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004769
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004770 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004771 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004773
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004775 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004776 *p++ = (char)(0xc0 | (ch >> 6));
4777 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004778 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 Py_ssize_t repsize, k, startpos;
4781 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 rep = unicode_encode_call_errorhandler(
4783 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004784 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 if (!rep)
4786 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 if (PyBytes_Check(rep))
4789 repsize = PyBytes_GET_SIZE(rep);
4790 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004791 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792
4793 if (repsize > 4) {
4794 Py_ssize_t offset;
4795
4796 if (result == NULL)
4797 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004798 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004801 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4802 /* integer overflow */
4803 PyErr_NoMemory();
4804 goto error;
4805 }
4806 nallocated += repsize - 4;
4807 if (result != NULL) {
4808 if (_PyBytes_Resize(&result, nallocated) < 0)
4809 goto error;
4810 } else {
4811 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004812 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 goto error;
4814 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4815 }
4816 p = PyBytes_AS_STRING(result) + offset;
4817 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819 if (PyBytes_Check(rep)) {
4820 char *prep = PyBytes_AS_STRING(rep);
4821 for(k = repsize; k > 0; k--)
4822 *p++ = *prep++;
4823 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004824 enum PyUnicode_Kind repkind;
4825 void *repdata;
4826
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004827 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004828 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004829 repkind = PyUnicode_KIND(rep);
4830 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831
4832 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004833 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004835 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004836 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004837 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004839 goto error;
4840 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004841 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004842 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004843 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004844 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004845 } else if (ch < 0x10000) {
4846 *p++ = (char)(0xe0 | (ch >> 12));
4847 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4848 *p++ = (char)(0x80 | (ch & 0x3f));
4849 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004850 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004851 /* Encode UCS4 Unicode ordinals */
4852 *p++ = (char)(0xf0 | (ch >> 18));
4853 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4854 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4855 *p++ = (char)(0x80 | (ch & 0x3f));
4856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004858
Guido van Rossum98297ee2007-11-06 21:34:58 +00004859 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004860 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004861 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004862 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004863 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004864 }
4865 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004866 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004867 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004868 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004869 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004872 Py_XDECREF(errorHandler);
4873 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004874 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004875 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004876 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004877 Py_XDECREF(errorHandler);
4878 Py_XDECREF(exc);
4879 Py_XDECREF(result);
4880 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004881
Tim Peters602f7402002-04-27 18:03:26 +00004882#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883}
4884
Alexander Belopolsky40018472011-02-26 01:02:56 +00004885PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4887 Py_ssize_t size,
4888 const char *errors)
4889{
4890 PyObject *v, *unicode;
4891
4892 unicode = PyUnicode_FromUnicode(s, size);
4893 if (unicode == NULL)
4894 return NULL;
4895 v = _PyUnicode_AsUTF8String(unicode, errors);
4896 Py_DECREF(unicode);
4897 return v;
4898}
4899
4900PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004901PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004903 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904}
4905
/* --- UTF-32 Codec ------------------------------------------------------- */
4907
4908PyObject *
4909PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 Py_ssize_t size,
4911 const char *errors,
4912 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913{
4914 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4915}
4916
4917PyObject *
4918PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 Py_ssize_t size,
4920 const char *errors,
4921 int *byteorder,
4922 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004923{
4924 const char *starts = s;
4925 Py_ssize_t startinpos;
4926 Py_ssize_t endinpos;
4927 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004928 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004929 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930 int bo = 0; /* assume native ordering by default */
4931 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932 /* Offsets from q for retrieving bytes in the right order. */
4933#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4934 int iorder[] = {0, 1, 2, 3};
4935#else
4936 int iorder[] = {3, 2, 1, 0};
4937#endif
4938 PyObject *errorHandler = NULL;
4939 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004940
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 q = (unsigned char *)s;
4942 e = q + size;
4943
4944 if (byteorder)
4945 bo = *byteorder;
4946
4947 /* Check for BOM marks (U+FEFF) in the input and adjust current
4948 byte order setting accordingly. In native mode, the leading BOM
4949 mark is skipped, in all other modes, it is copied to the output
4950 stream as-is (giving a ZWNBSP character). */
4951 if (bo == 0) {
4952 if (size >= 4) {
4953 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 if (bom == 0x0000FEFF) {
4957 q += 4;
4958 bo = -1;
4959 }
4960 else if (bom == 0xFFFE0000) {
4961 q += 4;
4962 bo = 1;
4963 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 if (bom == 0x0000FEFF) {
4966 q += 4;
4967 bo = 1;
4968 }
4969 else if (bom == 0xFFFE0000) {
4970 q += 4;
4971 bo = -1;
4972 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 }
4976
4977 if (bo == -1) {
4978 /* force LE */
4979 iorder[0] = 0;
4980 iorder[1] = 1;
4981 iorder[2] = 2;
4982 iorder[3] = 3;
4983 }
4984 else if (bo == 1) {
4985 /* force BE */
4986 iorder[0] = 3;
4987 iorder[1] = 2;
4988 iorder[2] = 1;
4989 iorder[3] = 0;
4990 }
4991
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004992 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004993 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004994 if (!unicode)
4995 return NULL;
4996 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004997 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004998 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004999
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 Py_UCS4 ch;
5002 /* remaining bytes at the end? (size should be divisible by 4) */
5003 if (e-q<4) {
5004 if (consumed)
5005 break;
5006 errmsg = "truncated data";
5007 startinpos = ((const char *)q)-starts;
5008 endinpos = ((const char *)e)-starts;
5009 goto utf32Error;
5010 /* The remaining input chars are ignored if the callback
5011 chooses to skip the input */
5012 }
5013 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5014 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 if (ch >= 0x110000)
5017 {
5018 errmsg = "codepoint not in range(0x110000)";
5019 startinpos = ((const char *)q)-starts;
5020 endinpos = startinpos+4;
5021 goto utf32Error;
5022 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005023 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5024 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 q += 4;
5026 continue;
5027 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 if (unicode_decode_call_errorhandler(
5029 errors, &errorHandler,
5030 "utf32", errmsg,
5031 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005032 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034 }
5035
5036 if (byteorder)
5037 *byteorder = bo;
5038
5039 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041
5042 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005043 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 goto onError;
5045
5046 Py_XDECREF(errorHandler);
5047 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005048 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 Py_DECREF(unicode);
5052 Py_XDECREF(errorHandler);
5053 Py_XDECREF(exc);
5054 return NULL;
5055}
5056
5057PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005058_PyUnicode_EncodeUTF32(PyObject *str,
5059 const char *errors,
5060 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062 int kind;
5063 void *data;
5064 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005065 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005067 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068 /* Offsets from p for storing byte pairs in the right order. */
5069#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5070 int iorder[] = {0, 1, 2, 3};
5071#else
5072 int iorder[] = {3, 2, 1, 0};
5073#endif
5074
Benjamin Peterson29060642009-01-31 22:14:21 +00005075#define STORECHAR(CH) \
5076 do { \
5077 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5078 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5079 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5080 p[iorder[0]] = (CH) & 0xff; \
5081 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 } while(0)
5083
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005084 if (!PyUnicode_Check(str)) {
5085 PyErr_BadArgument();
5086 return NULL;
5087 }
5088 if (PyUnicode_READY(str) < 0)
5089 return NULL;
5090 kind = PyUnicode_KIND(str);
5091 data = PyUnicode_DATA(str);
5092 len = PyUnicode_GET_LENGTH(str);
5093
5094 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005095 bytesize = nsize * 4;
5096 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005098 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099 if (v == NULL)
5100 return NULL;
5101
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005102 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005106 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107
5108 if (byteorder == -1) {
5109 /* force LE */
5110 iorder[0] = 0;
5111 iorder[1] = 1;
5112 iorder[2] = 2;
5113 iorder[3] = 3;
5114 }
5115 else if (byteorder == 1) {
5116 /* force BE */
5117 iorder[0] = 3;
5118 iorder[1] = 2;
5119 iorder[2] = 1;
5120 iorder[3] = 0;
5121 }
5122
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005123 for (i = 0; i < len; i++)
5124 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005125
5126 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005127 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128#undef STORECHAR
5129}
5130
Alexander Belopolsky40018472011-02-26 01:02:56 +00005131PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5133 Py_ssize_t size,
5134 const char *errors,
5135 int byteorder)
5136{
5137 PyObject *result;
5138 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5139 if (tmp == NULL)
5140 return NULL;
5141 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5142 Py_DECREF(tmp);
5143 return result;
5144}
5145
5146PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005147PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148{
Victor Stinnerb960b342011-11-20 19:12:52 +01005149 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150}
5151
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152/* --- UTF-16 Codec ------------------------------------------------------- */
5153
Tim Peters772747b2001-08-09 22:21:55 +00005154PyObject *
5155PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 Py_ssize_t size,
5157 const char *errors,
5158 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
Walter Dörwald69652032004-09-07 20:24:22 +00005160 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5161}
5162
Antoine Pitrouab868312009-01-10 15:40:25 +00005163/* Two masks for fast checking of whether a C 'long' may contain
5164 UTF16-encoded surrogate characters. This is an efficient heuristic,
5165 assuming that non-surrogate characters with a code point >= 0x8000 are
5166 rare in most input.
5167 FAST_CHAR_MASK is used when the input is in native byte ordering,
5168 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005169*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005170#if (SIZEOF_LONG == 8)
5171# define FAST_CHAR_MASK 0x8000800080008000L
5172# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5173#elif (SIZEOF_LONG == 4)
5174# define FAST_CHAR_MASK 0x80008000L
5175# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5176#else
5177# error C 'long' size should be either 4 or 8!
5178#endif
5179
Walter Dörwald69652032004-09-07 20:24:22 +00005180PyObject *
5181PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 Py_ssize_t size,
5183 const char *errors,
5184 int *byteorder,
5185 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t startinpos;
5189 Py_ssize_t endinpos;
5190 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005191 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005193 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005194 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005195 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005196 /* Offsets from q for retrieving byte pairs in the right order. */
5197#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5198 int ihi = 1, ilo = 0;
5199#else
5200 int ihi = 0, ilo = 1;
5201#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 PyObject *errorHandler = NULL;
5203 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
5205 /* Note: size will always be longer than the resulting Unicode
5206 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005207 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 if (!unicode)
5209 return NULL;
5210 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005211 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005212 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Tim Peters772747b2001-08-09 22:21:55 +00005214 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005215 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
5217 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005218 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005220 /* Check for BOM marks (U+FEFF) in the input and adjust current
5221 byte order setting accordingly. In native mode, the leading BOM
5222 mark is skipped, in all other modes, it is copied to the output
5223 stream as-is (giving a ZWNBSP character). */
5224 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005225 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005226 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 if (bom == 0xFEFF) {
5229 q += 2;
5230 bo = -1;
5231 }
5232 else if (bom == 0xFFFE) {
5233 q += 2;
5234 bo = 1;
5235 }
Tim Petersced69f82003-09-16 20:30:58 +00005236#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 if (bom == 0xFEFF) {
5238 q += 2;
5239 bo = 1;
5240 }
5241 else if (bom == 0xFFFE) {
5242 q += 2;
5243 bo = -1;
5244 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005245#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Tim Peters772747b2001-08-09 22:21:55 +00005249 if (bo == -1) {
5250 /* force LE */
5251 ihi = 1;
5252 ilo = 0;
5253 }
5254 else if (bo == 1) {
5255 /* force BE */
5256 ihi = 0;
5257 ilo = 1;
5258 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005259#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5260 native_ordering = ilo < ihi;
5261#else
5262 native_ordering = ilo > ihi;
5263#endif
Tim Peters772747b2001-08-09 22:21:55 +00005264
Antoine Pitrouab868312009-01-10 15:40:25 +00005265 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005266 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005267 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005268 /* First check for possible aligned read of a C 'long'. Unaligned
5269 reads are more expensive, better to defer to another iteration. */
5270 if (!((size_t) q & LONG_PTR_MASK)) {
5271 /* Fast path for runs of non-surrogate chars. */
5272 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005273 int kind = PyUnicode_KIND(unicode);
5274 void *data = PyUnicode_DATA(unicode);
5275 while (_q < aligned_end) {
5276 unsigned long block = * (unsigned long *) _q;
5277 unsigned short *pblock = (unsigned short*)&block;
5278 Py_UCS4 maxch;
5279 if (native_ordering) {
5280 /* Can use buffer directly */
5281 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005282 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005283 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005284 else {
5285 /* Need to byte-swap */
5286 unsigned char *_p = (unsigned char*)pblock;
5287 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005288 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005289 _p[0] = _q[1];
5290 _p[1] = _q[0];
5291 _p[2] = _q[3];
5292 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005293#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005294 _p[4] = _q[5];
5295 _p[5] = _q[4];
5296 _p[6] = _q[7];
5297 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005298#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005299 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005300 maxch = Py_MAX(pblock[0], pblock[1]);
5301#if SIZEOF_LONG == 8
5302 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5303#endif
5304 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5305 if (unicode_widen(&unicode, maxch) < 0)
5306 goto onError;
5307 kind = PyUnicode_KIND(unicode);
5308 data = PyUnicode_DATA(unicode);
5309 }
5310 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5311 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5312#if SIZEOF_LONG == 8
5313 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5314 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5315#endif
5316 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005317 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 q = _q;
5319 if (q >= e)
5320 break;
5321 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005323
Benjamin Peterson14339b62009-01-31 16:36:08 +00005324 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005325
Victor Stinner551ac952011-11-29 22:58:13 +01005326 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005327 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5328 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 continue;
5330 }
5331
5332 /* UTF-16 code pair: */
5333 if (q > e) {
5334 errmsg = "unexpected end of data";
5335 startinpos = (((const char *)q) - 2) - starts;
5336 endinpos = ((const char *)e) + 1 - starts;
5337 goto utf16Error;
5338 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005339 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5340 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005342 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005343 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005344 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005345 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 continue;
5347 }
5348 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005349 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 startinpos = (((const char *)q)-4)-starts;
5351 endinpos = startinpos+2;
5352 goto utf16Error;
5353 }
5354
Benjamin Peterson14339b62009-01-31 16:36:08 +00005355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 errmsg = "illegal encoding";
5357 startinpos = (((const char *)q)-2)-starts;
5358 endinpos = startinpos+2;
5359 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005360
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005363 errors,
5364 &errorHandler,
5365 "utf16", errmsg,
5366 &starts,
5367 (const char **)&e,
5368 &startinpos,
5369 &endinpos,
5370 &exc,
5371 (const char **)&q,
5372 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005373 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 /* remaining byte at the end? (size should be even) */
5377 if (e == q) {
5378 if (!consumed) {
5379 errmsg = "truncated data";
5380 startinpos = ((const char *)q) - starts;
5381 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005382 if (unicode_decode_call_errorhandler(
5383 errors,
5384 &errorHandler,
5385 "utf16", errmsg,
5386 &starts,
5387 (const char **)&e,
5388 &startinpos,
5389 &endinpos,
5390 &exc,
5391 (const char **)&q,
5392 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005393 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 goto onError;
5395 /* The remaining input chars are ignored if the callback
5396 chooses to skip the input */
5397 }
5398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
5400 if (byteorder)
5401 *byteorder = bo;
5402
Walter Dörwald69652032004-09-07 20:24:22 +00005403 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005407 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 goto onError;
5409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 Py_XDECREF(errorHandler);
5411 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005412 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 Py_XDECREF(errorHandler);
5417 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 return NULL;
5419}
5420
Antoine Pitrouab868312009-01-10 15:40:25 +00005421#undef FAST_CHAR_MASK
5422#undef SWAPPED_FAST_CHAR_MASK
5423
Tim Peters772747b2001-08-09 22:21:55 +00005424PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425_PyUnicode_EncodeUTF16(PyObject *str,
5426 const char *errors,
5427 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 int kind;
5430 void *data;
5431 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005432 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005433 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005434 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005435 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005436 /* Offsets from p for storing byte pairs in the right order. */
5437#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5438 int ihi = 1, ilo = 0;
5439#else
5440 int ihi = 0, ilo = 1;
5441#endif
5442
Benjamin Peterson29060642009-01-31 22:14:21 +00005443#define STORECHAR(CH) \
5444 do { \
5445 p[ihi] = ((CH) >> 8) & 0xff; \
5446 p[ilo] = (CH) & 0xff; \
5447 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005448 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 if (!PyUnicode_Check(str)) {
5451 PyErr_BadArgument();
5452 return NULL;
5453 }
5454 if (PyUnicode_READY(str) < 0)
5455 return NULL;
5456 kind = PyUnicode_KIND(str);
5457 data = PyUnicode_DATA(str);
5458 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005459
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005460 pairs = 0;
5461 if (kind == PyUnicode_4BYTE_KIND)
5462 for (i = 0; i < len; i++)
5463 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5464 pairs++;
5465 /* 2 * (len + pairs + (byteorder == 0)) */
5466 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005468 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005469 bytesize = nsize * 2;
5470 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 if (v == NULL)
5474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005480 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005481
5482 if (byteorder == -1) {
5483 /* force LE */
5484 ihi = 1;
5485 ilo = 0;
5486 }
5487 else if (byteorder == 1) {
5488 /* force BE */
5489 ihi = 0;
5490 ilo = 1;
5491 }
5492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005493 for (i = 0; i < len; i++) {
5494 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5495 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005497 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5498 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 }
Tim Peters772747b2001-08-09 22:21:55 +00005500 STORECHAR(ch);
5501 if (ch2)
5502 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005503 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005504
5505 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005506 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005507#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508}
5509
Alexander Belopolsky40018472011-02-26 01:02:56 +00005510PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005511PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5512 Py_ssize_t size,
5513 const char *errors,
5514 int byteorder)
5515{
5516 PyObject *result;
5517 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5518 if (tmp == NULL)
5519 return NULL;
5520 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5521 Py_DECREF(tmp);
5522 return result;
5523}
5524
5525PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005526PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005528 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529}
5530
5531/* --- Unicode Escape Codec ----------------------------------------------- */
5532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5534 if all the escapes in the string make it still a valid ASCII string.
5535 Returns -1 if any escapes were found which cause the string to
5536 pop out of ASCII range. Otherwise returns the length of the
5537 required buffer to hold the string.
5538 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005539static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5541{
5542 const unsigned char *p = (const unsigned char *)s;
5543 const unsigned char *end = p + size;
5544 Py_ssize_t length = 0;
5545
5546 if (size < 0)
5547 return -1;
5548
5549 for (; p < end; ++p) {
5550 if (*p > 127) {
5551 /* Non-ASCII */
5552 return -1;
5553 }
5554 else if (*p != '\\') {
5555 /* Normal character */
5556 ++length;
5557 }
5558 else {
5559 /* Backslash-escape, check next char */
5560 ++p;
5561 /* Escape sequence reaches till end of string or
5562 non-ASCII follow-up. */
5563 if (p >= end || *p > 127)
5564 return -1;
5565 switch (*p) {
5566 case '\n':
5567 /* backslash + \n result in zero characters */
5568 break;
5569 case '\\': case '\'': case '\"':
5570 case 'b': case 'f': case 't':
5571 case 'n': case 'r': case 'v': case 'a':
5572 ++length;
5573 break;
5574 case '0': case '1': case '2': case '3':
5575 case '4': case '5': case '6': case '7':
5576 case 'x': case 'u': case 'U': case 'N':
5577 /* these do not guarantee ASCII characters */
5578 return -1;
5579 default:
5580 /* count the backslash + the other character */
5581 length += 2;
5582 }
5583 }
5584 }
5585 return length;
5586}
5587
Fredrik Lundh06d12682001-01-24 07:59:11 +00005588static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005589
Alexander Belopolsky40018472011-02-26 01:02:56 +00005590PyObject *
5591PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005592 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005593 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005596 Py_ssize_t startinpos;
5597 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005599 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 char* message;
5602 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 PyObject *errorHandler = NULL;
5604 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005605 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005606 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005608 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609
5610 /* After length_of_escaped_ascii_string() there are two alternatives,
5611 either the string is pure ASCII with named escapes like \n, etc.
5612 and we determined it's exact size (common case)
5613 or it contains \x, \u, ... escape sequences. then we create a
5614 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615 if (len >= 0) {
5616 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617 if (!v)
5618 goto onError;
5619 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005620 }
5621 else {
5622 /* Escaped strings will always be longer than the resulting
5623 Unicode string, so we start with size here and then reduce the
5624 length after conversion to the true value.
5625 (but if the error callback returns a long replacement string
5626 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005627 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 if (!v)
5629 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005630 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 }
5632
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005634 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005637
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 while (s < end) {
5639 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005640 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 /* The only case in which i == ascii_length is a backslash
5644 followed by a newline. */
5645 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 /* Non-escape characters are interpreted as Unicode ordinals */
5648 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5650 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 continue;
5652 }
5653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* \ - Escapes */
5656 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005657 c = *s++;
5658 if (s > end)
5659 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005661 /* The only case in which i == ascii_length is a backslash
5662 followed by a newline. */
5663 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005664
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005665 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668#define WRITECHAR(ch) \
5669 do { \
5670 if (unicode_putchar(&v, &i, ch) < 0) \
5671 goto onError; \
5672 }while(0)
5673
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 case '\\': WRITECHAR('\\'); break;
5676 case '\'': WRITECHAR('\''); break;
5677 case '\"': WRITECHAR('\"'); break;
5678 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 case 'f': WRITECHAR('\014'); break;
5681 case 't': WRITECHAR('\t'); break;
5682 case 'n': WRITECHAR('\n'); break;
5683 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 case '0': case '1': case '2': case '3':
5691 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005692 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005693 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005694 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005695 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005696 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 break;
5700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 /* hex escapes */
5702 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005704 digits = 2;
5705 message = "truncated \\xXX escape";
5706 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005710 digits = 4;
5711 message = "truncated \\uXXXX escape";
5712 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005715 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005716 digits = 8;
5717 message = "truncated \\UXXXXXXXX escape";
5718 hexescape:
5719 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 if (s+digits>end) {
5721 endinpos = size;
5722 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 errors, &errorHandler,
5724 "unicodeescape", "end of string in escape sequence",
5725 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005726 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 goto onError;
5728 goto nextByte;
5729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 for (j = 0; j < digits; ++j) {
5731 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005732 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005733 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 errors, &errorHandler,
5736 "unicodeescape", message,
5737 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005739 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005740 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005742 }
5743 chr = (chr<<4) & ~0xF;
5744 if (c >= '0' && c <= '9')
5745 chr += c - '0';
5746 else if (c >= 'a' && c <= 'f')
5747 chr += 10 + c - 'a';
5748 else
5749 chr += 10 + c - 'A';
5750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005752 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 /* _decoding_error will have already written into the
5754 target buffer. */
5755 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005756 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005758 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005760 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 errors, &errorHandler,
5764 "unicodeescape", "illegal Unicode character",
5765 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 goto onError;
5768 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005769 break;
5770
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005772 case 'N':
5773 message = "malformed \\N character escape";
5774 if (ucnhash_CAPI == NULL) {
5775 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5777 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005778 if (ucnhash_CAPI == NULL)
5779 goto ucnhashError;
5780 }
5781 if (*s == '{') {
5782 const char *start = s+1;
5783 /* look for the closing brace */
5784 while (*s != '}' && s < end)
5785 s++;
5786 if (s > start && s < end && *s == '}') {
5787 /* found a name. look it up in the unicode database */
5788 message = "unknown Unicode character name";
5789 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005790 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005791 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005792 goto store;
5793 }
5794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 errors, &errorHandler,
5798 "unicodeescape", message,
5799 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005800 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005802 break;
5803
5804 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005805 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 message = "\\ at end of string";
5807 s--;
5808 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 errors, &errorHandler,
5811 "unicodeescape", message,
5812 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005813 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 goto onError;
5815 }
5816 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005817 WRITECHAR('\\');
5818 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005819 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005820 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826
Victor Stinner16e6a802011-12-12 13:24:15 +01005827 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005828 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005829 Py_XDECREF(errorHandler);
5830 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005831 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005834 PyErr_SetString(
5835 PyExc_UnicodeError,
5836 "\\N escapes not supported (can't load unicodedata module)"
5837 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005841 return NULL;
5842
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 return NULL;
5848}
5849
5850/* Return a Unicode-Escape string version of the Unicode object.
5851
5852 If quotes is true, the string is enclosed in u"" or u'' quotes as
5853 appropriate.
5854
5855*/
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005858PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005860 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005861 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005863 int kind;
5864 void *data;
5865 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
Thomas Wouters89f507f2006-12-13 04:49:30 +00005867 /* Initial allocation is based on the longest-possible unichr
5868 escape.
5869
5870 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5871 unichr, so in this case it's the longest unichr escape. In
5872 narrow (UTF-16) builds this is five chars per source unichr
5873 since there are two unichrs in the surrogate pair, so in narrow
5874 (UTF-16) builds it's not the longest unichr escape.
5875
5876 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5877 so in the narrow (UTF-16) build case it's the longest unichr
5878 escape.
5879 */
5880
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 if (!PyUnicode_Check(unicode)) {
5882 PyErr_BadArgument();
5883 return NULL;
5884 }
5885 if (PyUnicode_READY(unicode) < 0)
5886 return NULL;
5887 len = PyUnicode_GET_LENGTH(unicode);
5888 kind = PyUnicode_KIND(unicode);
5889 data = PyUnicode_DATA(unicode);
5890 switch(kind) {
5891 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5892 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5893 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5894 }
5895
5896 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005897 return PyBytes_FromStringAndSize(NULL, 0);
5898
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005899 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005901
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (repr == NULL)
5907 return NULL;
5908
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005912 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005913
Walter Dörwald79e913e2007-05-12 11:08:06 +00005914 /* Escape backslashes */
5915 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 *p++ = '\\';
5917 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005918 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005919 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005920
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005921 /* Map 21-bit characters to '\U00xxxxxx' */
5922 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005923 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005924 *p++ = '\\';
5925 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005926 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5927 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5928 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5929 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5930 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5931 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5932 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5933 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005935 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005938 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 *p++ = '\\';
5940 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005941 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5942 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5943 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5944 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005946
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005947 /* Map special whitespace to '\t', \n', '\r' */
5948 else if (ch == '\t') {
5949 *p++ = '\\';
5950 *p++ = 't';
5951 }
5952 else if (ch == '\n') {
5953 *p++ = '\\';
5954 *p++ = 'n';
5955 }
5956 else if (ch == '\r') {
5957 *p++ = '\\';
5958 *p++ = 'r';
5959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005961 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005962 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005964 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005965 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5966 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 /* Copy everything else as-is */
5970 else
5971 *p++ = (char) ch;
5972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005974 assert(p - PyBytes_AS_STRING(repr) > 0);
5975 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5976 return NULL;
5977 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978}
5979
Alexander Belopolsky40018472011-02-26 01:02:56 +00005980PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5982 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005984 PyObject *result;
5985 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5986 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988 result = PyUnicode_AsUnicodeEscapeString(tmp);
5989 Py_DECREF(tmp);
5990 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991}
5992
5993/* --- Raw Unicode Escape Codec ------------------------------------------- */
5994
Alexander Belopolsky40018472011-02-26 01:02:56 +00005995PyObject *
5996PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005997 Py_ssize_t size,
5998 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 Py_ssize_t startinpos;
6002 Py_ssize_t endinpos;
6003 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006004 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 const char *end;
6006 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 PyObject *errorHandler = NULL;
6008 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006009
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 /* Escaped strings will always be longer than the resulting
6011 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 length after conversion to the true value. (But decoding error
6013 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006014 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006018 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006019 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 end = s + size;
6021 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 unsigned char c;
6023 Py_UCS4 x;
6024 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006025 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 /* Non-escape characters are interpreted as Unicode ordinals */
6028 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006029 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6030 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006032 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 startinpos = s-starts;
6034
6035 /* \u-escapes are only interpreted iff the number of leading
6036 backslashes if odd */
6037 bs = s;
6038 for (;s < end;) {
6039 if (*s != '\\')
6040 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006041 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6042 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 }
6044 if (((s - bs) & 1) == 0 ||
6045 s >= end ||
6046 (*s != 'u' && *s != 'U')) {
6047 continue;
6048 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006049 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 count = *s=='u' ? 4 : 8;
6051 s++;
6052
6053 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 for (x = 0, i = 0; i < count; ++i, ++s) {
6055 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006056 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 endinpos = s-starts;
6058 if (unicode_decode_call_errorhandler(
6059 errors, &errorHandler,
6060 "rawunicodeescape", "truncated \\uXXXX",
6061 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006062 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 goto onError;
6064 goto nextByte;
6065 }
6066 x = (x<<4) & ~0xF;
6067 if (c >= '0' && c <= '9')
6068 x += c - '0';
6069 else if (c >= 'a' && c <= 'f')
6070 x += 10 + c - 'a';
6071 else
6072 x += 10 + c - 'A';
6073 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006074 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006075 if (unicode_putchar(&v, &outpos, x) < 0)
6076 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006077 } else {
6078 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006079 if (unicode_decode_call_errorhandler(
6080 errors, &errorHandler,
6081 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 nextByte:
6087 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006089 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 Py_XDECREF(errorHandler);
6092 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006093 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006094
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 Py_XDECREF(errorHandler);
6098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return NULL;
6100}
6101
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102
Alexander Belopolsky40018472011-02-26 01:02:56 +00006103PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006104PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006106 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 char *p;
6108 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006109 Py_ssize_t expandsize, pos;
6110 int kind;
6111 void *data;
6112 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114 if (!PyUnicode_Check(unicode)) {
6115 PyErr_BadArgument();
6116 return NULL;
6117 }
6118 if (PyUnicode_READY(unicode) < 0)
6119 return NULL;
6120 kind = PyUnicode_KIND(unicode);
6121 data = PyUnicode_DATA(unicode);
6122 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006123 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6124 bytes, and 1 byte characters 4. */
6125 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006129
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006130 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 if (repr == NULL)
6132 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006134 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006136 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 for (pos = 0; pos < len; pos++) {
6138 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* Map 32-bit characters to '\Uxxxxxxxx' */
6140 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006141 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006142 *p++ = '\\';
6143 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006144 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6145 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6146 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6147 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6148 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6149 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6150 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6151 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006152 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 *p++ = '\\';
6156 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006157 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6160 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 /* Copy everything else as-is */
6163 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 *p++ = (char) ch;
6165 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 assert(p > q);
6168 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006169 return NULL;
6170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171}
6172
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 PyObject *result;
6178 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6179 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006180 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6182 Py_DECREF(tmp);
6183 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006186/* --- Unicode Internal Codec ------------------------------------------- */
6187
Alexander Belopolsky40018472011-02-26 01:02:56 +00006188PyObject *
6189_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006190 Py_ssize_t size,
6191 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006192{
6193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194 Py_ssize_t startinpos;
6195 Py_ssize_t endinpos;
6196 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006197 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006198 const char *end;
6199 const char *reason;
6200 PyObject *errorHandler = NULL;
6201 PyObject *exc = NULL;
6202
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006203 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006204 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006205 1))
6206 return NULL;
6207
Thomas Wouters89f507f2006-12-13 04:49:30 +00006208 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006209 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006212 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006213 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006214 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006215 end = s + size;
6216
6217 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006218 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006219 Py_UCS4 ch;
6220 /* We copy the raw representation one byte at a time because the
6221 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006222 ((char *) &uch)[0] = s[0];
6223 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006224#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006225 ((char *) &uch)[2] = s[2];
6226 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006227#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006228 ch = uch;
6229
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006230 /* We have to sanity check the raw data, otherwise doom looms for
6231 some malformed UCS-4 data. */
6232 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006233#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006234 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006235#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236 end-s < Py_UNICODE_SIZE
6237 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006239 startinpos = s - starts;
6240 if (end-s < Py_UNICODE_SIZE) {
6241 endinpos = end-starts;
6242 reason = "truncated input";
6243 }
6244 else {
6245 endinpos = s - starts + Py_UNICODE_SIZE;
6246 reason = "illegal code point (> 0x10FFFF)";
6247 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006248 if (unicode_decode_call_errorhandler(
6249 errors, &errorHandler,
6250 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006251 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 continue;
6255 }
6256
6257 s += Py_UNICODE_SIZE;
6258#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006259 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006260 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006261 Py_UNICODE uch2;
6262 ((char *) &uch2)[0] = s[0];
6263 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006264 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006265 {
Victor Stinner551ac952011-11-29 22:58:13 +01006266 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006267 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 }
6269 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006270#endif
6271
6272 if (unicode_putchar(&v, &outpos, ch) < 0)
6273 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006274 }
6275
Victor Stinner16e6a802011-12-12 13:24:15 +01006276 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 goto onError;
6278 Py_XDECREF(errorHandler);
6279 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006280 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 Py_XDECREF(v);
6284 Py_XDECREF(errorHandler);
6285 Py_XDECREF(exc);
6286 return NULL;
6287}
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289/* --- Latin-1 Codec ------------------------------------------------------ */
6290
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291PyObject *
6292PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006293 Py_ssize_t size,
6294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006297 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298}
6299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006301static void
6302make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006303 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006304 PyObject *unicode,
6305 Py_ssize_t startpos, Py_ssize_t endpos,
6306 const char *reason)
6307{
6308 if (*exceptionObject == NULL) {
6309 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006310 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006311 encoding, unicode, startpos, endpos, reason);
6312 }
6313 else {
6314 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6315 goto onError;
6316 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6317 goto onError;
6318 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6319 goto onError;
6320 return;
6321 onError:
6322 Py_DECREF(*exceptionObject);
6323 *exceptionObject = NULL;
6324 }
6325}
6326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006328static void
6329raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006330 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006331 PyObject *unicode,
6332 Py_ssize_t startpos, Py_ssize_t endpos,
6333 const char *reason)
6334{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006335 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006336 encoding, unicode, startpos, endpos, reason);
6337 if (*exceptionObject != NULL)
6338 PyCodec_StrictErrors(*exceptionObject);
6339}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340
6341/* error handling callback helper:
6342 build arguments, call the callback and check the arguments,
6343 put the result into newpos and return the replacement string, which
6344 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006345static PyObject *
6346unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006347 PyObject **errorHandler,
6348 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006349 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006353 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 PyObject *restuple;
6356 PyObject *resunicode;
6357
6358 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 }
6363
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 if (PyUnicode_READY(unicode) < 0)
6365 return NULL;
6366 len = PyUnicode_GET_LENGTH(unicode);
6367
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006368 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372
6373 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006378 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 Py_DECREF(restuple);
6380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006382 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 &resunicode, newpos)) {
6384 Py_DECREF(restuple);
6385 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006387 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6388 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6389 Py_DECREF(restuple);
6390 return NULL;
6391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 *newpos = len + *newpos;
6394 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6396 Py_DECREF(restuple);
6397 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_INCREF(resunicode);
6400 Py_DECREF(restuple);
6401 return resunicode;
6402}
6403
Alexander Belopolsky40018472011-02-26 01:02:56 +00006404static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006406 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006407 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 /* input state */
6410 Py_ssize_t pos=0, size;
6411 int kind;
6412 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 /* output object */
6414 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 /* pointer into the output */
6416 char *str;
6417 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006419 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6420 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 PyObject *errorHandler = NULL;
6422 PyObject *exc = NULL;
6423 /* the following variable is used for caching string comparisons
6424 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6425 int known_errorHandler = -1;
6426
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006427 if (PyUnicode_READY(unicode) < 0)
6428 return NULL;
6429 size = PyUnicode_GET_LENGTH(unicode);
6430 kind = PyUnicode_KIND(unicode);
6431 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 /* allocate enough for a simple encoding without
6433 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006434 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006435 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006436 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006438 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006439 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 ressize = size;
6441
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 while (pos < size) {
6443 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 /* can we encode this? */
6446 if (c<limit) {
6447 /* no overflow check, because we know that the space is enough */
6448 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006450 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 Py_ssize_t requiredsize;
6453 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006454 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456 Py_ssize_t collstart = pos;
6457 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006459 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 ++collend;
6461 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6462 if (known_errorHandler==-1) {
6463 if ((errors==NULL) || (!strcmp(errors, "strict")))
6464 known_errorHandler = 1;
6465 else if (!strcmp(errors, "replace"))
6466 known_errorHandler = 2;
6467 else if (!strcmp(errors, "ignore"))
6468 known_errorHandler = 3;
6469 else if (!strcmp(errors, "xmlcharrefreplace"))
6470 known_errorHandler = 4;
6471 else
6472 known_errorHandler = 0;
6473 }
6474 switch (known_errorHandler) {
6475 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006476 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 goto onError;
6478 case 2: /* replace */
6479 while (collstart++<collend)
6480 *str++ = '?'; /* fall through */
6481 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006482 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 break;
6484 case 4: /* xmlcharrefreplace */
6485 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 /* determine replacement size */
6487 for (i = collstart, repsize = 0; i < collend; ++i) {
6488 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6489 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006501 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006502 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006504 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 if (requiredsize > ressize) {
6508 if (requiredsize<2*ressize)
6509 requiredsize = 2*ressize;
6510 if (_PyBytes_Resize(&res, requiredsize))
6511 goto onError;
6512 str = PyBytes_AS_STRING(res) + respos;
6513 ressize = requiredsize;
6514 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 /* generate replacement */
6516 for (i = collstart; i < collend; ++i) {
6517 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006519 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 break;
6521 default:
6522 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006523 encoding, reason, unicode, &exc,
6524 collstart, collend, &newpos);
6525 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6526 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006528 if (PyBytes_Check(repunicode)) {
6529 /* Directly copy bytes result to output. */
6530 repsize = PyBytes_Size(repunicode);
6531 if (repsize > 1) {
6532 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006533 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006534 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6535 Py_DECREF(repunicode);
6536 goto onError;
6537 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006538 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006539 ressize += repsize-1;
6540 }
6541 memcpy(str, PyBytes_AsString(repunicode), repsize);
6542 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006544 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006545 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 /* need more space? (at least enough for what we
6548 have+the replacement+the rest of the string, so
6549 we won't have to check space for encodable characters) */
6550 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 repsize = PyUnicode_GET_LENGTH(repunicode);
6552 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 if (requiredsize > ressize) {
6554 if (requiredsize<2*ressize)
6555 requiredsize = 2*ressize;
6556 if (_PyBytes_Resize(&res, requiredsize)) {
6557 Py_DECREF(repunicode);
6558 goto onError;
6559 }
6560 str = PyBytes_AS_STRING(res) + respos;
6561 ressize = requiredsize;
6562 }
6563 /* check if there is anything unencodable in the replacement
6564 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006565 for (i = 0; repsize-->0; ++i, ++str) {
6566 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006568 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 Py_DECREF(repunicode);
6571 goto onError;
6572 }
6573 *str = (char)c;
6574 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006576 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006578 }
6579 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006580 /* Resize if we allocated to much */
6581 size = str - PyBytes_AS_STRING(res);
6582 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006583 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006584 if (_PyBytes_Resize(&res, size) < 0)
6585 goto onError;
6586 }
6587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 Py_XDECREF(errorHandler);
6589 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006590 return res;
6591
6592 onError:
6593 Py_XDECREF(res);
6594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
6596 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597}
6598
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006599/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006600PyObject *
6601PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006602 Py_ssize_t size,
6603 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 PyObject *result;
6606 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6607 if (unicode == NULL)
6608 return NULL;
6609 result = unicode_encode_ucs1(unicode, errors, 256);
6610 Py_DECREF(unicode);
6611 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Alexander Belopolsky40018472011-02-26 01:02:56 +00006614PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006615_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
6617 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 PyErr_BadArgument();
6619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006621 if (PyUnicode_READY(unicode) == -1)
6622 return NULL;
6623 /* Fast path: if it is a one-byte string, construct
6624 bytes object directly. */
6625 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6626 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6627 PyUnicode_GET_LENGTH(unicode));
6628 /* Non-Latin-1 characters present. Defer to above function to
6629 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006631}
6632
6633PyObject*
6634PyUnicode_AsLatin1String(PyObject *unicode)
6635{
6636 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
6639/* --- 7-bit ASCII Codec -------------------------------------------------- */
6640
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641PyObject *
6642PyUnicode_DecodeASCII(const char *s,
6643 Py_ssize_t size,
6644 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006647 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006648 int kind;
6649 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006650 Py_ssize_t startinpos;
6651 Py_ssize_t endinpos;
6652 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006654 int has_error;
6655 const unsigned char *p = (const unsigned char *)s;
6656 const unsigned char *end = p + size;
6657 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 PyObject *errorHandler = NULL;
6659 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006660
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006661 if (size == 0) {
6662 Py_INCREF(unicode_empty);
6663 return unicode_empty;
6664 }
6665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006667 if (size == 1 && (unsigned char)s[0] < 128)
6668 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669
Victor Stinner702c7342011-10-05 13:50:52 +02006670 has_error = 0;
6671 while (p < end && !has_error) {
6672 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6673 an explanation. */
6674 if (!((size_t) p & LONG_PTR_MASK)) {
6675 /* Help register allocation */
6676 register const unsigned char *_p = p;
6677 while (_p < aligned_end) {
6678 unsigned long value = *(unsigned long *) _p;
6679 if (value & ASCII_CHAR_MASK) {
6680 has_error = 1;
6681 break;
6682 }
6683 _p += SIZEOF_LONG;
6684 }
6685 if (_p == end)
6686 break;
6687 if (has_error)
6688 break;
6689 p = _p;
6690 }
6691 if (*p & 0x80) {
6692 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006693 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006694 }
6695 else {
6696 ++p;
6697 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006698 }
Victor Stinner702c7342011-10-05 13:50:52 +02006699 if (!has_error)
6700 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006701
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006702 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006706 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006707 kind = PyUnicode_KIND(v);
6708 data = PyUnicode_DATA(v);
6709 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 e = s + size;
6711 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 register unsigned char c = (unsigned char)*s;
6713 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006714 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 ++s;
6716 }
6717 else {
6718 startinpos = s-starts;
6719 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 if (unicode_decode_call_errorhandler(
6721 errors, &errorHandler,
6722 "ascii", "ordinal not in range(128)",
6723 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006724 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006726 kind = PyUnicode_KIND(v);
6727 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006730 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006731 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 Py_XDECREF(errorHandler);
6733 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006734 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006735 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 Py_XDECREF(errorHandler);
6740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 return NULL;
6742}
6743
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006744/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006745PyObject *
6746PyUnicode_EncodeASCII(const Py_UNICODE *p,
6747 Py_ssize_t size,
6748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750 PyObject *result;
6751 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6752 if (unicode == NULL)
6753 return NULL;
6754 result = unicode_encode_ucs1(unicode, errors, 128);
6755 Py_DECREF(unicode);
6756 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006760_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761{
6762 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 PyErr_BadArgument();
6764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766 if (PyUnicode_READY(unicode) == -1)
6767 return NULL;
6768 /* Fast path: if it is an ASCII-only string, construct bytes object
6769 directly. Else defer to above function to raise the exception. */
6770 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6771 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6772 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006774}
6775
6776PyObject *
6777PyUnicode_AsASCIIString(PyObject *unicode)
6778{
6779 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
Victor Stinner99b95382011-07-04 14:23:54 +02006782#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006783
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006784/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006785
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006786#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787#define NEED_RETRY
6788#endif
6789
Victor Stinner3a50e702011-10-18 21:21:00 +02006790#ifndef WC_ERR_INVALID_CHARS
6791# define WC_ERR_INVALID_CHARS 0x0080
6792#endif
6793
6794static char*
6795code_page_name(UINT code_page, PyObject **obj)
6796{
6797 *obj = NULL;
6798 if (code_page == CP_ACP)
6799 return "mbcs";
6800 if (code_page == CP_UTF7)
6801 return "CP_UTF7";
6802 if (code_page == CP_UTF8)
6803 return "CP_UTF8";
6804
6805 *obj = PyBytes_FromFormat("cp%u", code_page);
6806 if (*obj == NULL)
6807 return NULL;
6808 return PyBytes_AS_STRING(*obj);
6809}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810
Alexander Belopolsky40018472011-02-26 01:02:56 +00006811static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006812is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813{
6814 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816
Victor Stinner3a50e702011-10-18 21:21:00 +02006817 if (!IsDBCSLeadByteEx(code_page, *curr))
6818 return 0;
6819
6820 prev = CharPrevExA(code_page, s, curr, 0);
6821 if (prev == curr)
6822 return 1;
6823 /* FIXME: This code is limited to "true" double-byte encodings,
6824 as it assumes an incomplete character consists of a single
6825 byte. */
6826 if (curr - prev == 2)
6827 return 1;
6828 if (!IsDBCSLeadByteEx(code_page, *prev))
6829 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006830 return 0;
6831}
6832
Victor Stinner3a50e702011-10-18 21:21:00 +02006833static DWORD
6834decode_code_page_flags(UINT code_page)
6835{
6836 if (code_page == CP_UTF7) {
6837 /* The CP_UTF7 decoder only supports flags=0 */
6838 return 0;
6839 }
6840 else
6841 return MB_ERR_INVALID_CHARS;
6842}
6843
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006845 * Decode a byte string from a Windows code page into unicode object in strict
6846 * mode.
6847 *
6848 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6849 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006852decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006853 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 const char *in,
6855 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856{
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006858 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860
6861 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006862 assert(insize > 0);
6863 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6864 if (outsize <= 0)
6865 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006866
6867 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006869 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 if (*v == NULL)
6871 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 }
6874 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006877 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006879 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 }
6881
6882 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6884 if (outsize <= 0)
6885 goto error;
6886 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006887
Victor Stinner3a50e702011-10-18 21:21:00 +02006888error:
6889 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6890 return -2;
6891 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006892 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893}
6894
Victor Stinner3a50e702011-10-18 21:21:00 +02006895/*
6896 * Decode a byte string from a code page into unicode object with an error
6897 * handler.
6898 *
6899 * Returns consumed size if succeed, or raise a WindowsError or
6900 * UnicodeDecodeError exception and returns -1 on error.
6901 */
6902static int
6903decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006904 PyObject **v,
6905 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 const char *errors)
6907{
6908 const char *startin = in;
6909 const char *endin = in + size;
6910 const DWORD flags = decode_code_page_flags(code_page);
6911 /* Ideally, we should get reason from FormatMessage. This is the Windows
6912 2000 English version of the message. */
6913 const char *reason = "No mapping for the Unicode character exists "
6914 "in the target code page.";
6915 /* each step cannot decode more than 1 character, but a character can be
6916 represented as a surrogate pair */
6917 wchar_t buffer[2], *startout, *out;
6918 int insize, outsize;
6919 PyObject *errorHandler = NULL;
6920 PyObject *exc = NULL;
6921 PyObject *encoding_obj = NULL;
6922 char *encoding;
6923 DWORD err;
6924 int ret = -1;
6925
6926 assert(size > 0);
6927
6928 encoding = code_page_name(code_page, &encoding_obj);
6929 if (encoding == NULL)
6930 return -1;
6931
6932 if (errors == NULL || strcmp(errors, "strict") == 0) {
6933 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6934 UnicodeDecodeError. */
6935 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6936 if (exc != NULL) {
6937 PyCodec_StrictErrors(exc);
6938 Py_CLEAR(exc);
6939 }
6940 goto error;
6941 }
6942
6943 if (*v == NULL) {
6944 /* Create unicode object */
6945 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6946 PyErr_NoMemory();
6947 goto error;
6948 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006949 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006950 if (*v == NULL)
6951 goto error;
6952 startout = PyUnicode_AS_UNICODE(*v);
6953 }
6954 else {
6955 /* Extend unicode object */
6956 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6957 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6958 PyErr_NoMemory();
6959 goto error;
6960 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006961 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 goto error;
6963 startout = PyUnicode_AS_UNICODE(*v) + n;
6964 }
6965
6966 /* Decode the byte string character per character */
6967 out = startout;
6968 while (in < endin)
6969 {
6970 /* Decode a character */
6971 insize = 1;
6972 do
6973 {
6974 outsize = MultiByteToWideChar(code_page, flags,
6975 in, insize,
6976 buffer, Py_ARRAY_LENGTH(buffer));
6977 if (outsize > 0)
6978 break;
6979 err = GetLastError();
6980 if (err != ERROR_NO_UNICODE_TRANSLATION
6981 && err != ERROR_INSUFFICIENT_BUFFER)
6982 {
6983 PyErr_SetFromWindowsErr(0);
6984 goto error;
6985 }
6986 insize++;
6987 }
6988 /* 4=maximum length of a UTF-8 sequence */
6989 while (insize <= 4 && (in + insize) <= endin);
6990
6991 if (outsize <= 0) {
6992 Py_ssize_t startinpos, endinpos, outpos;
6993
6994 startinpos = in - startin;
6995 endinpos = startinpos + 1;
6996 outpos = out - PyUnicode_AS_UNICODE(*v);
6997 if (unicode_decode_call_errorhandler(
6998 errors, &errorHandler,
6999 encoding, reason,
7000 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007001 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 {
7003 goto error;
7004 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007005 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007006 }
7007 else {
7008 in += insize;
7009 memcpy(out, buffer, outsize * sizeof(wchar_t));
7010 out += outsize;
7011 }
7012 }
7013
7014 /* write a NUL character at the end */
7015 *out = 0;
7016
7017 /* Extend unicode object */
7018 outsize = out - startout;
7019 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007020 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007022 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007023
7024error:
7025 Py_XDECREF(encoding_obj);
7026 Py_XDECREF(errorHandler);
7027 Py_XDECREF(exc);
7028 return ret;
7029}
7030
Victor Stinner3a50e702011-10-18 21:21:00 +02007031static PyObject *
7032decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 const char *s, Py_ssize_t size,
7034 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035{
Victor Stinner76a31a62011-11-04 00:05:13 +01007036 PyObject *v = NULL;
7037 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 if (code_page < 0) {
7040 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7041 return NULL;
7042 }
7043
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046
Victor Stinner76a31a62011-11-04 00:05:13 +01007047 do
7048 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007050 if (size > INT_MAX) {
7051 chunk_size = INT_MAX;
7052 final = 0;
7053 done = 0;
7054 }
7055 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 {
7058 chunk_size = (int)size;
7059 final = (consumed == NULL);
7060 done = 1;
7061 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 /* Skip trailing lead-byte unless 'final' is set */
7064 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7065 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066
Victor Stinner76a31a62011-11-04 00:05:13 +01007067 if (chunk_size == 0 && done) {
7068 if (v != NULL)
7069 break;
7070 Py_INCREF(unicode_empty);
7071 return unicode_empty;
7072 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
Victor Stinner76a31a62011-11-04 00:05:13 +01007074
7075 converted = decode_code_page_strict(code_page, &v,
7076 s, chunk_size);
7077 if (converted == -2)
7078 converted = decode_code_page_errors(code_page, &v,
7079 s, chunk_size,
7080 errors);
7081 assert(converted != 0);
7082
7083 if (converted < 0) {
7084 Py_XDECREF(v);
7085 return NULL;
7086 }
7087
7088 if (consumed)
7089 *consumed += converted;
7090
7091 s += converted;
7092 size -= converted;
7093 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007094
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007095 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096}
7097
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007099PyUnicode_DecodeCodePageStateful(int code_page,
7100 const char *s,
7101 Py_ssize_t size,
7102 const char *errors,
7103 Py_ssize_t *consumed)
7104{
7105 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7106}
7107
7108PyObject *
7109PyUnicode_DecodeMBCSStateful(const char *s,
7110 Py_ssize_t size,
7111 const char *errors,
7112 Py_ssize_t *consumed)
7113{
7114 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7115}
7116
7117PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118PyUnicode_DecodeMBCS(const char *s,
7119 Py_ssize_t size,
7120 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007121{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7123}
7124
Victor Stinner3a50e702011-10-18 21:21:00 +02007125static DWORD
7126encode_code_page_flags(UINT code_page, const char *errors)
7127{
7128 if (code_page == CP_UTF8) {
7129 if (winver.dwMajorVersion >= 6)
7130 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7131 and later */
7132 return WC_ERR_INVALID_CHARS;
7133 else
7134 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7135 return 0;
7136 }
7137 else if (code_page == CP_UTF7) {
7138 /* CP_UTF7 only supports flags=0 */
7139 return 0;
7140 }
7141 else {
7142 if (errors != NULL && strcmp(errors, "replace") == 0)
7143 return 0;
7144 else
7145 return WC_NO_BEST_FIT_CHARS;
7146 }
7147}
7148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 * Encode a Unicode string to a Windows code page into a byte string in strict
7151 * mode.
7152 *
7153 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7154 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007157encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007158 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160{
Victor Stinner554f3f02010-06-16 23:33:54 +00007161 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 BOOL *pusedDefaultChar = &usedDefaultChar;
7163 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007164 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007165 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007166 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 const DWORD flags = encode_code_page_flags(code_page, NULL);
7168 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 /* Create a substring so that we can get the UTF-16 representation
7170 of just the slice under consideration. */
7171 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172
Martin v. Löwis3d325192011-11-04 18:23:06 +01007173 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007174
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007176 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007178 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007179
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 substring = PyUnicode_Substring(unicode, offset, offset+len);
7181 if (substring == NULL)
7182 return -1;
7183 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7184 if (p == NULL) {
7185 Py_DECREF(substring);
7186 return -1;
7187 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007188
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007189 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 outsize = WideCharToMultiByte(code_page, flags,
7191 p, size,
7192 NULL, 0,
7193 NULL, pusedDefaultChar);
7194 if (outsize <= 0)
7195 goto error;
7196 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 if (pusedDefaultChar && *pusedDefaultChar) {
7198 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007200 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007201
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 if (*outbytes == NULL) {
7206 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210 }
7211 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 const Py_ssize_t n = PyBytes_Size(*outbytes);
7214 if (outsize > PY_SSIZE_T_MAX - n) {
7215 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007216 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7220 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007224 }
7225
7226 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 outsize = WideCharToMultiByte(code_page, flags,
7228 p, size,
7229 out, outsize,
7230 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (outsize <= 0)
7233 goto error;
7234 if (pusedDefaultChar && *pusedDefaultChar)
7235 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007236 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007237
Victor Stinner3a50e702011-10-18 21:21:00 +02007238error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007239 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7241 return -2;
7242 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007243 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007244}
7245
Victor Stinner3a50e702011-10-18 21:21:00 +02007246/*
7247 * Encode a Unicode string to a Windows code page into a byte string using a
7248 * error handler.
7249 *
7250 * Returns consumed characters if succeed, or raise a WindowsError and returns
7251 * -1 on other error.
7252 */
7253static int
7254encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007255 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007256 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007257{
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 Py_ssize_t pos = unicode_offset;
7260 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 /* Ideally, we should get reason from FormatMessage. This is the Windows
7262 2000 English version of the message. */
7263 const char *reason = "invalid character";
7264 /* 4=maximum length of a UTF-8 sequence */
7265 char buffer[4];
7266 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7267 Py_ssize_t outsize;
7268 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 PyObject *errorHandler = NULL;
7270 PyObject *exc = NULL;
7271 PyObject *encoding_obj = NULL;
7272 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007273 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 PyObject *rep;
7275 int ret = -1;
7276
7277 assert(insize > 0);
7278
7279 encoding = code_page_name(code_page, &encoding_obj);
7280 if (encoding == NULL)
7281 return -1;
7282
7283 if (errors == NULL || strcmp(errors, "strict") == 0) {
7284 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7285 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007286 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 if (exc != NULL) {
7288 PyCodec_StrictErrors(exc);
7289 Py_DECREF(exc);
7290 }
7291 Py_XDECREF(encoding_obj);
7292 return -1;
7293 }
7294
7295 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7296 pusedDefaultChar = &usedDefaultChar;
7297 else
7298 pusedDefaultChar = NULL;
7299
7300 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7301 PyErr_NoMemory();
7302 goto error;
7303 }
7304 outsize = insize * Py_ARRAY_LENGTH(buffer);
7305
7306 if (*outbytes == NULL) {
7307 /* Create string object */
7308 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7309 if (*outbytes == NULL)
7310 goto error;
7311 out = PyBytes_AS_STRING(*outbytes);
7312 }
7313 else {
7314 /* Extend string object */
7315 Py_ssize_t n = PyBytes_Size(*outbytes);
7316 if (n > PY_SSIZE_T_MAX - outsize) {
7317 PyErr_NoMemory();
7318 goto error;
7319 }
7320 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7321 goto error;
7322 out = PyBytes_AS_STRING(*outbytes) + n;
7323 }
7324
7325 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007328 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7329 wchar_t chars[2];
7330 int charsize;
7331 if (ch < 0x10000) {
7332 chars[0] = (wchar_t)ch;
7333 charsize = 1;
7334 }
7335 else {
7336 ch -= 0x10000;
7337 chars[0] = 0xd800 + (ch >> 10);
7338 chars[1] = 0xdc00 + (ch & 0x3ff);
7339 charsize = 2;
7340 }
7341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007343 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007344 buffer, Py_ARRAY_LENGTH(buffer),
7345 NULL, pusedDefaultChar);
7346 if (outsize > 0) {
7347 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7348 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007349 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 memcpy(out, buffer, outsize);
7351 out += outsize;
7352 continue;
7353 }
7354 }
7355 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7356 PyErr_SetFromWindowsErr(0);
7357 goto error;
7358 }
7359
Victor Stinner3a50e702011-10-18 21:21:00 +02007360 rep = unicode_encode_call_errorhandler(
7361 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007362 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007363 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 if (rep == NULL)
7365 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007366 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007367
7368 if (PyBytes_Check(rep)) {
7369 outsize = PyBytes_GET_SIZE(rep);
7370 if (outsize != 1) {
7371 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7372 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7373 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7374 Py_DECREF(rep);
7375 goto error;
7376 }
7377 out = PyBytes_AS_STRING(*outbytes) + offset;
7378 }
7379 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7380 out += outsize;
7381 }
7382 else {
7383 Py_ssize_t i;
7384 enum PyUnicode_Kind kind;
7385 void *data;
7386
7387 if (PyUnicode_READY(rep) < 0) {
7388 Py_DECREF(rep);
7389 goto error;
7390 }
7391
7392 outsize = PyUnicode_GET_LENGTH(rep);
7393 if (outsize != 1) {
7394 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7395 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7396 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7397 Py_DECREF(rep);
7398 goto error;
7399 }
7400 out = PyBytes_AS_STRING(*outbytes) + offset;
7401 }
7402 kind = PyUnicode_KIND(rep);
7403 data = PyUnicode_DATA(rep);
7404 for (i=0; i < outsize; i++) {
7405 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7406 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007407 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007408 encoding, unicode,
7409 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 "unable to encode error handler result to ASCII");
7411 Py_DECREF(rep);
7412 goto error;
7413 }
7414 *out = (unsigned char)ch;
7415 out++;
7416 }
7417 }
7418 Py_DECREF(rep);
7419 }
7420 /* write a NUL byte */
7421 *out = 0;
7422 outsize = out - PyBytes_AS_STRING(*outbytes);
7423 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7424 if (_PyBytes_Resize(outbytes, outsize) < 0)
7425 goto error;
7426 ret = 0;
7427
7428error:
7429 Py_XDECREF(encoding_obj);
7430 Py_XDECREF(errorHandler);
7431 Py_XDECREF(exc);
7432 return ret;
7433}
7434
Victor Stinner3a50e702011-10-18 21:21:00 +02007435static PyObject *
7436encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007437 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 const char *errors)
7439{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007440 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007442 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007444
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 if (PyUnicode_READY(unicode) < 0)
7446 return NULL;
7447 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007448
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 if (code_page < 0) {
7450 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7451 return NULL;
7452 }
7453
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 return PyBytes_FromStringAndSize(NULL, 0);
7456
Victor Stinner7581cef2011-11-03 22:32:33 +01007457 offset = 0;
7458 do
7459 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007460#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007462 chunks. */
7463 if (len > INT_MAX/2) {
7464 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007465 done = 0;
7466 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007469 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007471 done = 1;
7472 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473
Victor Stinner76a31a62011-11-04 00:05:13 +01007474 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 errors);
7477 if (ret == -2)
7478 ret = encode_code_page_errors(code_page, &outbytes,
7479 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007480 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007481 if (ret < 0) {
7482 Py_XDECREF(outbytes);
7483 return NULL;
7484 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485
Victor Stinner7581cef2011-11-03 22:32:33 +01007486 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007488 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 return outbytes;
7491}
7492
7493PyObject *
7494PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7495 Py_ssize_t size,
7496 const char *errors)
7497{
Victor Stinner7581cef2011-11-03 22:32:33 +01007498 PyObject *unicode, *res;
7499 unicode = PyUnicode_FromUnicode(p, size);
7500 if (unicode == NULL)
7501 return NULL;
7502 res = encode_code_page(CP_ACP, unicode, errors);
7503 Py_DECREF(unicode);
7504 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007505}
7506
7507PyObject *
7508PyUnicode_EncodeCodePage(int code_page,
7509 PyObject *unicode,
7510 const char *errors)
7511{
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007513}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007514
Alexander Belopolsky40018472011-02-26 01:02:56 +00007515PyObject *
7516PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007517{
7518 if (!PyUnicode_Check(unicode)) {
7519 PyErr_BadArgument();
7520 return NULL;
7521 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007522 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007523}
7524
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525#undef NEED_RETRY
7526
Victor Stinner99b95382011-07-04 14:23:54 +02007527#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007528
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529/* --- Character Mapping Codec -------------------------------------------- */
7530
Alexander Belopolsky40018472011-02-26 01:02:56 +00007531PyObject *
7532PyUnicode_DecodeCharmap(const char *s,
7533 Py_ssize_t size,
7534 PyObject *mapping,
7535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538 Py_ssize_t startinpos;
7539 Py_ssize_t endinpos;
7540 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007541 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007542 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007543 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007544 PyObject *errorHandler = NULL;
7545 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007546
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 /* Default to Latin-1 */
7548 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007551 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007555 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007556 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007558 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007559 Py_ssize_t maplen;
7560 enum PyUnicode_Kind kind;
7561 void *data;
7562 Py_UCS4 x;
7563
7564 if (PyUnicode_READY(mapping) < 0)
7565 return NULL;
7566
7567 maplen = PyUnicode_GET_LENGTH(mapping);
7568 data = PyUnicode_DATA(mapping);
7569 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 while (s < e) {
7571 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007574 x = PyUnicode_READ(kind, data, ch);
7575 else
7576 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007578 if (x == 0xfffe)
7579 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 startinpos = s-starts;
7582 endinpos = startinpos+1;
7583 if (unicode_decode_call_errorhandler(
7584 errors, &errorHandler,
7585 "charmap", "character maps to <undefined>",
7586 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007587 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 goto onError;
7589 }
7590 continue;
7591 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007592
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007593 if (unicode_putchar(&v, &outpos, x) < 0)
7594 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007596 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007597 }
7598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 while (s < e) {
7600 unsigned char ch = *s;
7601 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007602
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7604 w = PyLong_FromLong((long)ch);
7605 if (w == NULL)
7606 goto onError;
7607 x = PyObject_GetItem(mapping, w);
7608 Py_DECREF(w);
7609 if (x == NULL) {
7610 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7611 /* No mapping found means: mapping is undefined. */
7612 PyErr_Clear();
7613 x = Py_None;
7614 Py_INCREF(x);
7615 } else
7616 goto onError;
7617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 /* Apply mapping */
7620 if (PyLong_Check(x)) {
7621 long value = PyLong_AS_LONG(x);
7622 if (value < 0 || value > 65535) {
7623 PyErr_SetString(PyExc_TypeError,
7624 "character mapping must be in range(65536)");
7625 Py_DECREF(x);
7626 goto onError;
7627 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007628 if (unicode_putchar(&v, &outpos, value) < 0)
7629 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 }
7631 else if (x == Py_None) {
7632 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 startinpos = s-starts;
7634 endinpos = startinpos+1;
7635 if (unicode_decode_call_errorhandler(
7636 errors, &errorHandler,
7637 "charmap", "character maps to <undefined>",
7638 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007639 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 Py_DECREF(x);
7641 goto onError;
7642 }
7643 Py_DECREF(x);
7644 continue;
7645 }
7646 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007647 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007649 if (PyUnicode_READY(x) < 0)
7650 goto onError;
7651 targetsize = PyUnicode_GET_LENGTH(x);
7652
7653 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007655 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007656 PyUnicode_READ_CHAR(x, 0)) < 0)
7657 goto onError;
7658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 else if (targetsize > 1) {
7660 /* 1-n mapping */
7661 if (targetsize > extrachars) {
7662 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 Py_ssize_t needed = (targetsize - extrachars) + \
7664 (targetsize << 2);
7665 extrachars += needed;
7666 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007667 if (unicode_resize(&v,
7668 PyUnicode_GET_LENGTH(v) + needed) < 0)
7669 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 Py_DECREF(x);
7671 goto onError;
7672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007674 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7675 goto onError;
7676 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7677 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 extrachars -= targetsize;
7679 }
7680 /* 1-0 mapping: skip the character */
7681 }
7682 else {
7683 /* wrong return value */
7684 PyErr_SetString(PyExc_TypeError,
7685 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686 Py_DECREF(x);
7687 goto onError;
7688 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 Py_DECREF(x);
7690 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007693 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 Py_XDECREF(errorHandler);
7696 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007697 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007698
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007700 Py_XDECREF(errorHandler);
7701 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 Py_XDECREF(v);
7703 return NULL;
7704}
7705
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706/* Charmap encoding: the lookup table */
7707
Alexander Belopolsky40018472011-02-26 01:02:56 +00007708struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 PyObject_HEAD
7710 unsigned char level1[32];
7711 int count2, count3;
7712 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713};
7714
7715static PyObject*
7716encoding_map_size(PyObject *obj, PyObject* args)
7717{
7718 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007719 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721}
7722
7723static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007724 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 PyDoc_STR("Return the size (in bytes) of this object") },
7726 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007727};
7728
7729static void
7730encoding_map_dealloc(PyObject* o)
7731{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007733}
7734
7735static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007736 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 "EncodingMap", /*tp_name*/
7738 sizeof(struct encoding_map), /*tp_basicsize*/
7739 0, /*tp_itemsize*/
7740 /* methods */
7741 encoding_map_dealloc, /*tp_dealloc*/
7742 0, /*tp_print*/
7743 0, /*tp_getattr*/
7744 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007745 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 0, /*tp_repr*/
7747 0, /*tp_as_number*/
7748 0, /*tp_as_sequence*/
7749 0, /*tp_as_mapping*/
7750 0, /*tp_hash*/
7751 0, /*tp_call*/
7752 0, /*tp_str*/
7753 0, /*tp_getattro*/
7754 0, /*tp_setattro*/
7755 0, /*tp_as_buffer*/
7756 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7757 0, /*tp_doc*/
7758 0, /*tp_traverse*/
7759 0, /*tp_clear*/
7760 0, /*tp_richcompare*/
7761 0, /*tp_weaklistoffset*/
7762 0, /*tp_iter*/
7763 0, /*tp_iternext*/
7764 encoding_map_methods, /*tp_methods*/
7765 0, /*tp_members*/
7766 0, /*tp_getset*/
7767 0, /*tp_base*/
7768 0, /*tp_dict*/
7769 0, /*tp_descr_get*/
7770 0, /*tp_descr_set*/
7771 0, /*tp_dictoffset*/
7772 0, /*tp_init*/
7773 0, /*tp_alloc*/
7774 0, /*tp_new*/
7775 0, /*tp_free*/
7776 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777};
7778
7779PyObject*
7780PyUnicode_BuildEncodingMap(PyObject* string)
7781{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782 PyObject *result;
7783 struct encoding_map *mresult;
7784 int i;
7785 int need_dict = 0;
7786 unsigned char level1[32];
7787 unsigned char level2[512];
7788 unsigned char *mlevel1, *mlevel2, *mlevel3;
7789 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 int kind;
7791 void *data;
7792 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795 PyErr_BadArgument();
7796 return NULL;
7797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798 kind = PyUnicode_KIND(string);
7799 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007800 memset(level1, 0xFF, sizeof level1);
7801 memset(level2, 0xFF, sizeof level2);
7802
7803 /* If there isn't a one-to-one mapping of NULL to \0,
7804 or if there are non-BMP characters, we need to use
7805 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807 need_dict = 1;
7808 for (i = 1; i < 256; i++) {
7809 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 ch = PyUnicode_READ(kind, data, i);
7811 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 need_dict = 1;
7813 break;
7814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007815 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816 /* unmapped character */
7817 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007818 l1 = ch >> 11;
7819 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 if (level1[l1] == 0xFF)
7821 level1[l1] = count2++;
7822 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007824 }
7825
7826 if (count2 >= 0xFF || count3 >= 0xFF)
7827 need_dict = 1;
7828
7829 if (need_dict) {
7830 PyObject *result = PyDict_New();
7831 PyObject *key, *value;
7832 if (!result)
7833 return NULL;
7834 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007836 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 if (!key || !value)
7838 goto failed1;
7839 if (PyDict_SetItem(result, key, value) == -1)
7840 goto failed1;
7841 Py_DECREF(key);
7842 Py_DECREF(value);
7843 }
7844 return result;
7845 failed1:
7846 Py_XDECREF(key);
7847 Py_XDECREF(value);
7848 Py_DECREF(result);
7849 return NULL;
7850 }
7851
7852 /* Create a three-level trie */
7853 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7854 16*count2 + 128*count3 - 1);
7855 if (!result)
7856 return PyErr_NoMemory();
7857 PyObject_Init(result, &EncodingMapType);
7858 mresult = (struct encoding_map*)result;
7859 mresult->count2 = count2;
7860 mresult->count3 = count3;
7861 mlevel1 = mresult->level1;
7862 mlevel2 = mresult->level23;
7863 mlevel3 = mresult->level23 + 16*count2;
7864 memcpy(mlevel1, level1, 32);
7865 memset(mlevel2, 0xFF, 16*count2);
7866 memset(mlevel3, 0, 128*count3);
7867 count3 = 0;
7868 for (i = 1; i < 256; i++) {
7869 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007870 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 /* unmapped character */
7872 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007873 o1 = PyUnicode_READ(kind, data, i)>>11;
7874 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007875 i2 = 16*mlevel1[o1] + o2;
7876 if (mlevel2[i2] == 0xFF)
7877 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879 i3 = 128*mlevel2[i2] + o3;
7880 mlevel3[i3] = i;
7881 }
7882 return result;
7883}
7884
7885static int
Victor Stinner22168992011-11-20 17:09:18 +01007886encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887{
7888 struct encoding_map *map = (struct encoding_map*)mapping;
7889 int l1 = c>>11;
7890 int l2 = (c>>7) & 0xF;
7891 int l3 = c & 0x7F;
7892 int i;
7893
Victor Stinner22168992011-11-20 17:09:18 +01007894 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 if (c == 0)
7897 return 0;
7898 /* level 1*/
7899 i = map->level1[l1];
7900 if (i == 0xFF) {
7901 return -1;
7902 }
7903 /* level 2*/
7904 i = map->level23[16*i+l2];
7905 if (i == 0xFF) {
7906 return -1;
7907 }
7908 /* level 3 */
7909 i = map->level23[16*map->count2 + 128*i + l3];
7910 if (i == 0) {
7911 return -1;
7912 }
7913 return i;
7914}
7915
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007916/* Lookup the character ch in the mapping. If the character
7917 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007918 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007919static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007920charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921{
Christian Heimes217cfd12007-12-02 14:31:20 +00007922 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 PyObject *x;
7924
7925 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007927 x = PyObject_GetItem(mapping, w);
7928 Py_DECREF(w);
7929 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7931 /* No mapping found means: mapping is undefined. */
7932 PyErr_Clear();
7933 x = Py_None;
7934 Py_INCREF(x);
7935 return x;
7936 } else
7937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007939 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007941 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 long value = PyLong_AS_LONG(x);
7943 if (value < 0 || value > 255) {
7944 PyErr_SetString(PyExc_TypeError,
7945 "character mapping must be in range(256)");
7946 Py_DECREF(x);
7947 return NULL;
7948 }
7949 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007951 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 /* wrong return value */
7955 PyErr_Format(PyExc_TypeError,
7956 "character mapping must return integer, bytes or None, not %.400s",
7957 x->ob_type->tp_name);
7958 Py_DECREF(x);
7959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960 }
7961}
7962
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007963static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007964charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007966 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7967 /* exponentially overallocate to minimize reallocations */
7968 if (requiredsize < 2*outsize)
7969 requiredsize = 2*outsize;
7970 if (_PyBytes_Resize(outobj, requiredsize))
7971 return -1;
7972 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973}
7974
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* output was written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007979 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 space is available. Return a new reference to the object that
7981 was put in the output buffer, or Py_None, if the mapping was undefined
7982 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007983 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007984static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007985charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007986 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007987{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988 PyObject *rep;
7989 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007990 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991
Christian Heimes90aa7642007-12-19 02:45:37 +00007992 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007995 if (res == -1)
7996 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 if (outsize<requiredsize)
7998 if (charmapencode_resize(outobj, outpos, requiredsize))
7999 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008000 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 outstart[(*outpos)++] = (char)res;
8002 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003 }
8004
8005 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 Py_DECREF(rep);
8010 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 if (PyLong_Check(rep)) {
8013 Py_ssize_t requiredsize = *outpos+1;
8014 if (outsize<requiredsize)
8015 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8016 Py_DECREF(rep);
8017 return enc_EXCEPTION;
8018 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008019 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008021 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 else {
8023 const char *repchars = PyBytes_AS_STRING(rep);
8024 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8025 Py_ssize_t requiredsize = *outpos+repsize;
8026 if (outsize<requiredsize)
8027 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8028 Py_DECREF(rep);
8029 return enc_EXCEPTION;
8030 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008031 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 memcpy(outstart + *outpos, repchars, repsize);
8033 *outpos += repsize;
8034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008035 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 Py_DECREF(rep);
8037 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008038}
8039
8040/* handle an error in PyUnicode_EncodeCharmap
8041 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008042static int
8043charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008044 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008046 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008047 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048{
8049 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008050 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008051 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008052 enum PyUnicode_Kind kind;
8053 void *data;
8054 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008055 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008056 Py_ssize_t collstartpos = *inpos;
8057 Py_ssize_t collendpos = *inpos+1;
8058 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059 char *encoding = "charmap";
8060 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008062 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008063 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008064
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008065 if (PyUnicode_READY(unicode) < 0)
8066 return -1;
8067 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068 /* find all unencodable characters */
8069 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008071 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008072 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008073 val = encoding_map_lookup(ch, mapping);
8074 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 break;
8076 ++collendpos;
8077 continue;
8078 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008080 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8081 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 if (rep==NULL)
8083 return -1;
8084 else if (rep!=Py_None) {
8085 Py_DECREF(rep);
8086 break;
8087 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008088 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 }
8091 /* cache callback name lookup
8092 * (if not done yet, i.e. it's the first error) */
8093 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 if ((errors==NULL) || (!strcmp(errors, "strict")))
8095 *known_errorHandler = 1;
8096 else if (!strcmp(errors, "replace"))
8097 *known_errorHandler = 2;
8098 else if (!strcmp(errors, "ignore"))
8099 *known_errorHandler = 3;
8100 else if (!strcmp(errors, "xmlcharrefreplace"))
8101 *known_errorHandler = 4;
8102 else
8103 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 }
8105 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008107 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008108 return -1;
8109 case 2: /* replace */
8110 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 x = charmapencode_output('?', mapping, res, respos);
8112 if (x==enc_EXCEPTION) {
8113 return -1;
8114 }
8115 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008116 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 return -1;
8118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 }
8120 /* fall through */
8121 case 3: /* ignore */
8122 *inpos = collendpos;
8123 break;
8124 case 4: /* xmlcharrefreplace */
8125 /* generate replacement (temporarily (mis)uses p) */
8126 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 char buffer[2+29+1+1];
8128 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008129 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 for (cp = buffer; *cp; ++cp) {
8131 x = charmapencode_output(*cp, mapping, res, respos);
8132 if (x==enc_EXCEPTION)
8133 return -1;
8134 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008135 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 return -1;
8137 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 }
8139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 *inpos = collendpos;
8141 break;
8142 default:
8143 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008148 if (PyBytes_Check(repunicode)) {
8149 /* Directly copy bytes result to output. */
8150 Py_ssize_t outsize = PyBytes_Size(*res);
8151 Py_ssize_t requiredsize;
8152 repsize = PyBytes_Size(repunicode);
8153 requiredsize = *respos + repsize;
8154 if (requiredsize > outsize)
8155 /* Make room for all additional bytes. */
8156 if (charmapencode_resize(res, respos, requiredsize)) {
8157 Py_DECREF(repunicode);
8158 return -1;
8159 }
8160 memcpy(PyBytes_AsString(*res) + *respos,
8161 PyBytes_AsString(repunicode), repsize);
8162 *respos += repsize;
8163 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008164 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008165 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008166 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008168 if (PyUnicode_READY(repunicode) < 0) {
8169 Py_DECREF(repunicode);
8170 return -1;
8171 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008172 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008173 data = PyUnicode_DATA(repunicode);
8174 kind = PyUnicode_KIND(repunicode);
8175 for (index = 0; index < repsize; index++) {
8176 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8177 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008179 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 return -1;
8181 }
8182 else if (x==enc_FAILED) {
8183 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008184 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 return -1;
8186 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 }
8188 *inpos = newpos;
8189 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008190 }
8191 return 0;
8192}
8193
Alexander Belopolsky40018472011-02-26 01:02:56 +00008194PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008195_PyUnicode_EncodeCharmap(PyObject *unicode,
8196 PyObject *mapping,
8197 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 /* output object */
8200 PyObject *res = NULL;
8201 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008202 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008203 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008205 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008206 PyObject *errorHandler = NULL;
8207 PyObject *exc = NULL;
8208 /* the following variable is used for caching string comparisons
8209 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8210 * 3=ignore, 4=xmlcharrefreplace */
8211 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008213 if (PyUnicode_READY(unicode) < 0)
8214 return NULL;
8215 size = PyUnicode_GET_LENGTH(unicode);
8216
Guido van Rossumd57fd912000-03-10 22:53:23 +00008217 /* Default to Latin-1 */
8218 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008219 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 /* allocate enough for a simple encoding without
8222 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008223 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 if (res == NULL)
8225 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008226 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008230 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 if (x==enc_EXCEPTION) /* error */
8234 goto onError;
8235 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008236 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 &exc,
8238 &known_errorHandler, &errorHandler, errors,
8239 &res, &respos)) {
8240 goto onError;
8241 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 else
8244 /* done with this character => adjust input position */
8245 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008249 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008250 if (_PyBytes_Resize(&res, respos) < 0)
8251 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 Py_XDECREF(exc);
8254 Py_XDECREF(errorHandler);
8255 return res;
8256
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 Py_XDECREF(res);
8259 Py_XDECREF(exc);
8260 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 return NULL;
8262}
8263
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008264/* Deprecated */
8265PyObject *
8266PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8267 Py_ssize_t size,
8268 PyObject *mapping,
8269 const char *errors)
8270{
8271 PyObject *result;
8272 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8273 if (unicode == NULL)
8274 return NULL;
8275 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8276 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008277 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008278}
8279
Alexander Belopolsky40018472011-02-26 01:02:56 +00008280PyObject *
8281PyUnicode_AsCharmapString(PyObject *unicode,
8282 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283{
8284 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 PyErr_BadArgument();
8286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289}
8290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008292static void
8293make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008295 Py_ssize_t startpos, Py_ssize_t endpos,
8296 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 *exceptionObject = _PyUnicodeTranslateError_Create(
8300 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 }
8302 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8304 goto onError;
8305 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8306 goto onError;
8307 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8308 goto onError;
8309 return;
8310 onError:
8311 Py_DECREF(*exceptionObject);
8312 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 }
8314}
8315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008317static void
8318raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008320 Py_ssize_t startpos, Py_ssize_t endpos,
8321 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322{
8323 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327}
8328
8329/* error handling callback helper:
8330 build arguments, call the callback and check the arguments,
8331 put the result into newpos and return the replacement string, which
8332 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static PyObject *
8334unicode_translate_call_errorhandler(const char *errors,
8335 PyObject **errorHandler,
8336 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008338 Py_ssize_t startpos, Py_ssize_t endpos,
8339 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008341 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008343 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 PyObject *restuple;
8345 PyObject *resunicode;
8346
8347 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 }
8352
8353 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357
8358 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008363 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 Py_DECREF(restuple);
8365 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 }
8367 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 &resunicode, &i_newpos)) {
8369 Py_DECREF(restuple);
8370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008372 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008374 else
8375 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8378 Py_DECREF(restuple);
8379 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008380 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 Py_INCREF(resunicode);
8382 Py_DECREF(restuple);
8383 return resunicode;
8384}
8385
8386/* Lookup the character ch in the mapping and put the result in result,
8387 which must be decrefed by the caller.
8388 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008389static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391{
Christian Heimes217cfd12007-12-02 14:31:20 +00008392 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 PyObject *x;
8394
8395 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 x = PyObject_GetItem(mapping, w);
8398 Py_DECREF(w);
8399 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8401 /* No mapping found means: use 1:1 mapping. */
8402 PyErr_Clear();
8403 *result = NULL;
8404 return 0;
8405 } else
8406 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 }
8408 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 *result = x;
8410 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008412 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 long value = PyLong_AS_LONG(x);
8414 long max = PyUnicode_GetMax();
8415 if (value < 0 || value > max) {
8416 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008417 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 Py_DECREF(x);
8419 return -1;
8420 }
8421 *result = x;
8422 return 0;
8423 }
8424 else if (PyUnicode_Check(x)) {
8425 *result = x;
8426 return 0;
8427 }
8428 else {
8429 /* wrong return value */
8430 PyErr_SetString(PyExc_TypeError,
8431 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 Py_DECREF(x);
8433 return -1;
8434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435}
8436/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 if not reallocate and adjust various state variables.
8438 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008439static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008444 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 /* exponentially overallocate to minimize reallocations */
8446 if (requiredsize < 2 * oldsize)
8447 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8449 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 }
8453 return 0;
8454}
8455/* lookup the character, put the result in the output string and adjust
8456 various state variables. Return a new reference to the object that
8457 was put in the output buffer in *result, or Py_None, if the mapping was
8458 undefined (in which case no character was written).
8459 The called must decref result.
8460 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8463 PyObject *mapping, Py_UCS4 **output,
8464 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008465 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8468 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
8474 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008476 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 }
8480 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 Py_ssize_t repsize;
8482 if (PyUnicode_READY(*res) == -1)
8483 return -1;
8484 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 if (repsize==1) {
8486 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 }
8489 else if (repsize!=0) {
8490 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 Py_ssize_t requiredsize = *opos +
8492 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 Py_ssize_t i;
8495 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 for(i = 0; i < repsize; i++)
8498 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500 }
8501 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 return 0;
8504}
8505
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507_PyUnicode_TranslateCharmap(PyObject *input,
8508 PyObject *mapping,
8509 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 /* input object */
8512 char *idata;
8513 Py_ssize_t size, i;
8514 int kind;
8515 /* output buffer */
8516 Py_UCS4 *output = NULL;
8517 Py_ssize_t osize;
8518 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 char *reason = "character maps to <undefined>";
8522 PyObject *errorHandler = NULL;
8523 PyObject *exc = NULL;
8524 /* the following variable is used for caching string comparisons
8525 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8526 * 3=ignore, 4=xmlcharrefreplace */
8527 int known_errorHandler = -1;
8528
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 PyErr_BadArgument();
8531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 if (PyUnicode_READY(input) == -1)
8535 return NULL;
8536 idata = (char*)PyUnicode_DATA(input);
8537 kind = PyUnicode_KIND(input);
8538 size = PyUnicode_GET_LENGTH(input);
8539 i = 0;
8540
8541 if (size == 0) {
8542 Py_INCREF(input);
8543 return input;
8544 }
8545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 /* allocate enough for a simple 1:1 translation without
8547 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 osize = size;
8549 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8550 opos = 0;
8551 if (output == NULL) {
8552 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 /* try to encode it */
8558 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 if (charmaptranslate_output(input, i, mapping,
8560 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 Py_XDECREF(x);
8562 goto onError;
8563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 else { /* untranslatable character */
8568 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8569 Py_ssize_t repsize;
8570 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_ssize_t collstart = i;
8574 Py_ssize_t collend = i+1;
8575 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 while (collend < size) {
8579 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 goto onError;
8581 Py_XDECREF(x);
8582 if (x!=Py_None)
8583 break;
8584 ++collend;
8585 }
8586 /* cache callback name lookup
8587 * (if not done yet, i.e. it's the first error) */
8588 if (known_errorHandler==-1) {
8589 if ((errors==NULL) || (!strcmp(errors, "strict")))
8590 known_errorHandler = 1;
8591 else if (!strcmp(errors, "replace"))
8592 known_errorHandler = 2;
8593 else if (!strcmp(errors, "ignore"))
8594 known_errorHandler = 3;
8595 else if (!strcmp(errors, "xmlcharrefreplace"))
8596 known_errorHandler = 4;
8597 else
8598 known_errorHandler = 0;
8599 }
8600 switch (known_errorHandler) {
8601 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 raise_translate_exception(&exc, input, collstart,
8603 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008604 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 case 2: /* replace */
8606 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 for (coll = collstart; coll<collend; coll++)
8608 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 /* fall through */
8610 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 break;
8613 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 /* generate replacement (temporarily (mis)uses i) */
8615 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 char buffer[2+29+1+1];
8617 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8619 if (charmaptranslate_makespace(&output, &osize,
8620 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 goto onError;
8622 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 break;
8627 default:
8628 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 reason, input, &exc,
8630 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008631 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008633 if (PyUnicode_READY(repunicode) < 0) {
8634 Py_DECREF(repunicode);
8635 goto onError;
8636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 repsize = PyUnicode_GET_LENGTH(repunicode);
8639 if (charmaptranslate_makespace(&output, &osize,
8640 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 Py_DECREF(repunicode);
8642 goto onError;
8643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 for (uni2 = 0; repsize-->0; ++uni2)
8645 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8646 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008648 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008649 }
8650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8652 if (!res)
8653 goto onError;
8654 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655 Py_XDECREF(exc);
8656 Py_XDECREF(errorHandler);
8657 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 Py_XDECREF(exc);
8662 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 return NULL;
8664}
8665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666/* Deprecated. Use PyUnicode_Translate instead. */
8667PyObject *
8668PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8669 Py_ssize_t size,
8670 PyObject *mapping,
8671 const char *errors)
8672{
8673 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8674 if (!unicode)
8675 return NULL;
8676 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8677}
8678
Alexander Belopolsky40018472011-02-26 01:02:56 +00008679PyObject *
8680PyUnicode_Translate(PyObject *str,
8681 PyObject *mapping,
8682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683{
8684 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008685
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 str = PyUnicode_FromObject(str);
8687 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 Py_DECREF(str);
8691 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008692
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 Py_XDECREF(str);
8695 return NULL;
8696}
Tim Petersced69f82003-09-16 20:30:58 +00008697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008699fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700{
8701 /* No need to call PyUnicode_READY(self) because this function is only
8702 called as a callback from fixup() which does it already. */
8703 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8704 const int kind = PyUnicode_KIND(self);
8705 void *data = PyUnicode_DATA(self);
8706 Py_UCS4 maxchar = 0, ch, fixed;
8707 Py_ssize_t i;
8708
8709 for (i = 0; i < len; ++i) {
8710 ch = PyUnicode_READ(kind, data, i);
8711 fixed = 0;
8712 if (ch > 127) {
8713 if (Py_UNICODE_ISSPACE(ch))
8714 fixed = ' ';
8715 else {
8716 const int decimal = Py_UNICODE_TODECIMAL(ch);
8717 if (decimal >= 0)
8718 fixed = '0' + decimal;
8719 }
8720 if (fixed != 0) {
8721 if (fixed > maxchar)
8722 maxchar = fixed;
8723 PyUnicode_WRITE(kind, data, i, fixed);
8724 }
8725 else if (ch > maxchar)
8726 maxchar = ch;
8727 }
8728 else if (ch > maxchar)
8729 maxchar = ch;
8730 }
8731
8732 return maxchar;
8733}
8734
8735PyObject *
8736_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8737{
8738 if (!PyUnicode_Check(unicode)) {
8739 PyErr_BadInternalCall();
8740 return NULL;
8741 }
8742 if (PyUnicode_READY(unicode) == -1)
8743 return NULL;
8744 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8745 /* If the string is already ASCII, just return the same string */
8746 Py_INCREF(unicode);
8747 return unicode;
8748 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008749 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750}
8751
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008752PyObject *
8753PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8754 Py_ssize_t length)
8755{
Victor Stinnerf0124502011-11-21 23:12:56 +01008756 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008757 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008758 Py_UCS4 maxchar;
8759 enum PyUnicode_Kind kind;
8760 void *data;
8761
8762 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008763 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008764 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008765 if (ch > 127) {
8766 int decimal = Py_UNICODE_TODECIMAL(ch);
8767 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008768 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008769 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008770 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008771 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008772
8773 /* Copy to a new string */
8774 decimal = PyUnicode_New(length, maxchar);
8775 if (decimal == NULL)
8776 return decimal;
8777 kind = PyUnicode_KIND(decimal);
8778 data = PyUnicode_DATA(decimal);
8779 /* Iterate over code points */
8780 for (i = 0; i < length; i++) {
8781 Py_UNICODE ch = s[i];
8782 if (ch > 127) {
8783 int decimal = Py_UNICODE_TODECIMAL(ch);
8784 if (decimal >= 0)
8785 ch = '0' + decimal;
8786 }
8787 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008789 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008790}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008791/* --- Decimal Encoder ---------------------------------------------------- */
8792
Alexander Belopolsky40018472011-02-26 01:02:56 +00008793int
8794PyUnicode_EncodeDecimal(Py_UNICODE *s,
8795 Py_ssize_t length,
8796 char *output,
8797 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008798{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008799 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008800 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008801 enum PyUnicode_Kind kind;
8802 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008803
8804 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 PyErr_BadArgument();
8806 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008807 }
8808
Victor Stinner42bf7752011-11-21 22:52:58 +01008809 unicode = PyUnicode_FromUnicode(s, length);
8810 if (unicode == NULL)
8811 return -1;
8812
Victor Stinner6345be92011-11-25 20:09:01 +01008813 if (PyUnicode_READY(unicode) < 0) {
8814 Py_DECREF(unicode);
8815 return -1;
8816 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008817 kind = PyUnicode_KIND(unicode);
8818 data = PyUnicode_DATA(unicode);
8819
Victor Stinnerb84d7232011-11-22 01:50:07 +01008820 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008821 PyObject *exc;
8822 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008824 Py_ssize_t startpos;
8825
8826 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008827
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008829 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008830 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008832 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 decimal = Py_UNICODE_TODECIMAL(ch);
8834 if (decimal >= 0) {
8835 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008836 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 continue;
8838 }
8839 if (0 < ch && ch < 256) {
8840 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008841 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 continue;
8843 }
Victor Stinner6345be92011-11-25 20:09:01 +01008844
Victor Stinner42bf7752011-11-21 22:52:58 +01008845 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008846 exc = NULL;
8847 raise_encode_exception(&exc, "decimal", unicode,
8848 startpos, startpos+1,
8849 "invalid decimal Unicode string");
8850 Py_XDECREF(exc);
8851 Py_DECREF(unicode);
8852 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008853 }
8854 /* 0-terminate the output string */
8855 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008856 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008857 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008858}
8859
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860/* --- Helpers ------------------------------------------------------------ */
8861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008863any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 Py_ssize_t start,
8865 Py_ssize_t end)
8866{
8867 int kind1, kind2, kind;
8868 void *buf1, *buf2;
8869 Py_ssize_t len1, len2, result;
8870
8871 kind1 = PyUnicode_KIND(s1);
8872 kind2 = PyUnicode_KIND(s2);
8873 kind = kind1 > kind2 ? kind1 : kind2;
8874 buf1 = PyUnicode_DATA(s1);
8875 buf2 = PyUnicode_DATA(s2);
8876 if (kind1 != kind)
8877 buf1 = _PyUnicode_AsKind(s1, kind);
8878 if (!buf1)
8879 return -2;
8880 if (kind2 != kind)
8881 buf2 = _PyUnicode_AsKind(s2, kind);
8882 if (!buf2) {
8883 if (kind1 != kind) PyMem_Free(buf1);
8884 return -2;
8885 }
8886 len1 = PyUnicode_GET_LENGTH(s1);
8887 len2 = PyUnicode_GET_LENGTH(s2);
8888
Victor Stinner794d5672011-10-10 03:21:36 +02008889 if (direction > 0) {
8890 switch(kind) {
8891 case PyUnicode_1BYTE_KIND:
8892 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8893 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8894 else
8895 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8896 break;
8897 case PyUnicode_2BYTE_KIND:
8898 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8899 break;
8900 case PyUnicode_4BYTE_KIND:
8901 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8902 break;
8903 default:
8904 assert(0); result = -2;
8905 }
8906 }
8907 else {
8908 switch(kind) {
8909 case PyUnicode_1BYTE_KIND:
8910 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8911 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8912 else
8913 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8914 break;
8915 case PyUnicode_2BYTE_KIND:
8916 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8917 break;
8918 case PyUnicode_4BYTE_KIND:
8919 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8920 break;
8921 default:
8922 assert(0); result = -2;
8923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 }
8925
8926 if (kind1 != kind)
8927 PyMem_Free(buf1);
8928 if (kind2 != kind)
8929 PyMem_Free(buf2);
8930
8931 return result;
8932}
8933
8934Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008935_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 Py_ssize_t n_buffer,
8937 void *digits, Py_ssize_t n_digits,
8938 Py_ssize_t min_width,
8939 const char *grouping,
8940 const char *thousands_sep)
8941{
8942 switch(kind) {
8943 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008944 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8945 return _PyUnicode_ascii_InsertThousandsGrouping(
8946 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8947 min_width, grouping, thousands_sep);
8948 else
8949 return _PyUnicode_ucs1_InsertThousandsGrouping(
8950 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8951 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 case PyUnicode_2BYTE_KIND:
8953 return _PyUnicode_ucs2_InsertThousandsGrouping(
8954 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8955 min_width, grouping, thousands_sep);
8956 case PyUnicode_4BYTE_KIND:
8957 return _PyUnicode_ucs4_InsertThousandsGrouping(
8958 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8959 min_width, grouping, thousands_sep);
8960 }
8961 assert(0);
8962 return -1;
8963}
8964
8965
/* Helper macro to fix up start/end slice values against a sequence of
   length len, modifying start and end in place:
     - end greater than len is clamped to len;
     - a negative end or start has len added, and is set to 0 if it is
       still negative afterwards.
   start is not clamped from above, so callers must still cope with
   start > end.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside an unbraced if/else; invoke it with
   a trailing semicolon.  len may be evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008980
Alexander Belopolsky40018472011-02-26 01:02:56 +00008981Py_ssize_t
8982PyUnicode_Count(PyObject *str,
8983 PyObject *substr,
8984 Py_ssize_t start,
8985 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008987 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008988 PyObject* str_obj;
8989 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 int kind1, kind2, kind;
8991 void *buf1 = NULL, *buf2 = NULL;
8992 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008993
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008994 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008997 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008998 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 Py_DECREF(str_obj);
9000 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001 }
Tim Petersced69f82003-09-16 20:30:58 +00009002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 kind1 = PyUnicode_KIND(str_obj);
9004 kind2 = PyUnicode_KIND(sub_obj);
9005 kind = kind1 > kind2 ? kind1 : kind2;
9006 buf1 = PyUnicode_DATA(str_obj);
9007 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009008 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (!buf1)
9010 goto onError;
9011 buf2 = PyUnicode_DATA(sub_obj);
9012 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009013 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 if (!buf2)
9015 goto onError;
9016 len1 = PyUnicode_GET_LENGTH(str_obj);
9017 len2 = PyUnicode_GET_LENGTH(sub_obj);
9018
9019 ADJUST_INDICES(start, end, len1);
9020 switch(kind) {
9021 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009022 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9023 result = asciilib_count(
9024 ((Py_UCS1*)buf1) + start, end - start,
9025 buf2, len2, PY_SSIZE_T_MAX
9026 );
9027 else
9028 result = ucs1lib_count(
9029 ((Py_UCS1*)buf1) + start, end - start,
9030 buf2, len2, PY_SSIZE_T_MAX
9031 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 break;
9033 case PyUnicode_2BYTE_KIND:
9034 result = ucs2lib_count(
9035 ((Py_UCS2*)buf1) + start, end - start,
9036 buf2, len2, PY_SSIZE_T_MAX
9037 );
9038 break;
9039 case PyUnicode_4BYTE_KIND:
9040 result = ucs4lib_count(
9041 ((Py_UCS4*)buf1) + start, end - start,
9042 buf2, len2, PY_SSIZE_T_MAX
9043 );
9044 break;
9045 default:
9046 assert(0); result = 0;
9047 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009048
9049 Py_DECREF(sub_obj);
9050 Py_DECREF(str_obj);
9051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (kind1 != kind)
9053 PyMem_Free(buf1);
9054 if (kind2 != kind)
9055 PyMem_Free(buf2);
9056
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 onError:
9059 Py_DECREF(sub_obj);
9060 Py_DECREF(str_obj);
9061 if (kind1 != kind && buf1)
9062 PyMem_Free(buf1);
9063 if (kind2 != kind && buf2)
9064 PyMem_Free(buf2);
9065 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066}
9067
Alexander Belopolsky40018472011-02-26 01:02:56 +00009068Py_ssize_t
9069PyUnicode_Find(PyObject *str,
9070 PyObject *sub,
9071 Py_ssize_t start,
9072 Py_ssize_t end,
9073 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009075 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009076
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009080 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 Py_DECREF(str);
9083 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 }
Tim Petersced69f82003-09-16 20:30:58 +00009085
Victor Stinner794d5672011-10-10 03:21:36 +02009086 result = any_find_slice(direction,
9087 str, sub, start, end
9088 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009091 Py_DECREF(sub);
9092
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 return result;
9094}
9095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096Py_ssize_t
9097PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9098 Py_ssize_t start, Py_ssize_t end,
9099 int direction)
9100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009102 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 if (PyUnicode_READY(str) == -1)
9104 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009105 if (start < 0 || end < 0) {
9106 PyErr_SetString(PyExc_IndexError, "string index out of range");
9107 return -2;
9108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 if (end > PyUnicode_GET_LENGTH(str))
9110 end = PyUnicode_GET_LENGTH(str);
9111 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009112 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9113 kind, end-start, ch, direction);
9114 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009116 else
9117 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118}
9119
Alexander Belopolsky40018472011-02-26 01:02:56 +00009120static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009121tailmatch(PyObject *self,
9122 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009123 Py_ssize_t start,
9124 Py_ssize_t end,
9125 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 int kind_self;
9128 int kind_sub;
9129 void *data_self;
9130 void *data_sub;
9131 Py_ssize_t offset;
9132 Py_ssize_t i;
9133 Py_ssize_t end_sub;
9134
9135 if (PyUnicode_READY(self) == -1 ||
9136 PyUnicode_READY(substring) == -1)
9137 return 0;
9138
9139 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 return 1;
9141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9143 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 kind_self = PyUnicode_KIND(self);
9148 data_self = PyUnicode_DATA(self);
9149 kind_sub = PyUnicode_KIND(substring);
9150 data_sub = PyUnicode_DATA(substring);
9151 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9152
9153 if (direction > 0)
9154 offset = end;
9155 else
9156 offset = start;
9157
9158 if (PyUnicode_READ(kind_self, data_self, offset) ==
9159 PyUnicode_READ(kind_sub, data_sub, 0) &&
9160 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9161 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9162 /* If both are of the same kind, memcmp is sufficient */
9163 if (kind_self == kind_sub) {
9164 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009165 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 data_sub,
9167 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009168 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 }
9170 /* otherwise we have to compare each character by first accesing it */
9171 else {
9172 /* We do not need to compare 0 and len(substring)-1 because
9173 the if statement above ensured already that they are equal
9174 when we end up here. */
9175 // TODO: honor direction and do a forward or backwards search
9176 for (i = 1; i < end_sub; ++i) {
9177 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9178 PyUnicode_READ(kind_sub, data_sub, i))
9179 return 0;
9180 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 }
9184
9185 return 0;
9186}
9187
Alexander Belopolsky40018472011-02-26 01:02:56 +00009188Py_ssize_t
9189PyUnicode_Tailmatch(PyObject *str,
9190 PyObject *substr,
9191 Py_ssize_t start,
9192 Py_ssize_t end,
9193 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009195 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009196
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 str = PyUnicode_FromObject(str);
9198 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200 substr = PyUnicode_FromObject(substr);
9201 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 Py_DECREF(str);
9203 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204 }
Tim Petersced69f82003-09-16 20:30:58 +00009205
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009206 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 Py_DECREF(str);
9209 Py_DECREF(substr);
9210 return result;
9211}
9212
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213/* Apply fixfct filter to the Unicode object self and return a
9214 reference to the modified object */
9215
Alexander Belopolsky40018472011-02-26 01:02:56 +00009216static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009217fixup(PyObject *self,
9218 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 PyObject *u;
9221 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009222 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009224 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009227 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 /* fix functions return the new maximum character in a string,
9230 if the kind of the resulting unicode object does not change,
9231 everything is fine. Otherwise we need to change the string kind
9232 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009233 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009234
9235 if (maxchar_new == 0) {
9236 /* no changes */;
9237 if (PyUnicode_CheckExact(self)) {
9238 Py_DECREF(u);
9239 Py_INCREF(self);
9240 return self;
9241 }
9242 else
9243 return u;
9244 }
9245
9246 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 maxchar_new = 127;
9248 else if (maxchar_new <= 255)
9249 maxchar_new = 255;
9250 else if (maxchar_new <= 65535)
9251 maxchar_new = 65535;
9252 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009253 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254
Victor Stinnereaab6042011-12-11 22:22:39 +01009255 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009257
9258 /* In case the maximum character changed, we need to
9259 convert the string to the new category. */
9260 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9261 if (v == NULL) {
9262 Py_DECREF(u);
9263 return NULL;
9264 }
9265 if (maxchar_new > maxchar_old) {
9266 /* If the maxchar increased so that the kind changed, not all
9267 characters are representable anymore and we need to fix the
9268 string again. This only happens in very few cases. */
9269 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9270 maxchar_old = fixfct(v);
9271 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 }
9273 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009274 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009276 Py_DECREF(u);
9277 assert(_PyUnicode_CheckConsistency(v, 1));
9278 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279}
9280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009282fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 /* No need to call PyUnicode_READY(self) because this function is only
9285 called as a callback from fixup() which does it already. */
9286 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9287 const int kind = PyUnicode_KIND(self);
9288 void *data = PyUnicode_DATA(self);
9289 int touched = 0;
9290 Py_UCS4 maxchar = 0;
9291 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 for (i = 0; i < len; ++i) {
9294 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9295 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9296 if (up != ch) {
9297 if (up > maxchar)
9298 maxchar = up;
9299 PyUnicode_WRITE(kind, data, i, up);
9300 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 else if (ch > maxchar)
9303 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 }
9305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (touched)
9307 return maxchar;
9308 else
9309 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310}
9311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009313fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9316 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9317 const int kind = PyUnicode_KIND(self);
9318 void *data = PyUnicode_DATA(self);
9319 int touched = 0;
9320 Py_UCS4 maxchar = 0;
9321 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 for(i = 0; i < len; ++i) {
9324 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9325 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9326 if (lo != ch) {
9327 if (lo > maxchar)
9328 maxchar = lo;
9329 PyUnicode_WRITE(kind, data, i, lo);
9330 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 else if (ch > maxchar)
9333 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334 }
9335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 if (touched)
9337 return maxchar;
9338 else
9339 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340}
9341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009343fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9346 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9347 const int kind = PyUnicode_KIND(self);
9348 void *data = PyUnicode_DATA(self);
9349 int touched = 0;
9350 Py_UCS4 maxchar = 0;
9351 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 for(i = 0; i < len; ++i) {
9354 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9355 Py_UCS4 nu = 0;
9356
9357 if (Py_UNICODE_ISUPPER(ch))
9358 nu = Py_UNICODE_TOLOWER(ch);
9359 else if (Py_UNICODE_ISLOWER(ch))
9360 nu = Py_UNICODE_TOUPPER(ch);
9361
9362 if (nu != 0) {
9363 if (nu > maxchar)
9364 maxchar = nu;
9365 PyUnicode_WRITE(kind, data, i, nu);
9366 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 else if (ch > maxchar)
9369 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
9371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 if (touched)
9373 return maxchar;
9374 else
9375 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376}
9377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009379fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9382 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9383 const int kind = PyUnicode_KIND(self);
9384 void *data = PyUnicode_DATA(self);
9385 int touched = 0;
9386 Py_UCS4 maxchar = 0;
9387 Py_ssize_t i = 0;
9388 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009389
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009390 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392
9393 ch = PyUnicode_READ(kind, data, i);
9394 if (!Py_UNICODE_ISUPPER(ch)) {
9395 maxchar = Py_UNICODE_TOUPPER(ch);
9396 PyUnicode_WRITE(kind, data, i, maxchar);
9397 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 ++i;
9400 for(; i < len; ++i) {
9401 ch = PyUnicode_READ(kind, data, i);
9402 if (!Py_UNICODE_ISLOWER(ch)) {
9403 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9404 if (lo > maxchar)
9405 maxchar = lo;
9406 PyUnicode_WRITE(kind, data, i, lo);
9407 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 else if (ch > maxchar)
9410 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412
9413 if (touched)
9414 return maxchar;
9415 else
9416 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417}
9418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009420fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9423 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9424 const int kind = PyUnicode_KIND(self);
9425 void *data = PyUnicode_DATA(self);
9426 Py_UCS4 maxchar = 0;
9427 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428 int previous_is_cased;
9429
9430 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 if (len == 1) {
9432 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9433 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9434 if (ti != ch) {
9435 PyUnicode_WRITE(kind, data, i, ti);
9436 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 }
9438 else
9439 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 for(; i < len; ++i) {
9443 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9444 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009445
Benjamin Peterson29060642009-01-31 22:14:21 +00009446 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 nu = Py_UNICODE_TOTITLE(ch);
9450
9451 if (nu > maxchar)
9452 maxchar = nu;
9453 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009454
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 if (Py_UNICODE_ISLOWER(ch) ||
9456 Py_UNICODE_ISUPPER(ch) ||
9457 Py_UNICODE_ISTITLE(ch))
9458 previous_is_cased = 1;
9459 else
9460 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463}
9464
Tim Peters8ce9f162004-08-27 01:49:32 +00009465PyObject *
9466PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009472 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9473 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009474 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009476 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009478 int use_memcpy;
9479 unsigned char *res_data = NULL, *sep_data = NULL;
9480 PyObject *last_obj;
9481 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482
Tim Peters05eba1f2004-08-27 21:32:02 +00009483 fseq = PySequence_Fast(seq, "");
9484 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009486 }
9487
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009488 /* NOTE: the following code can't call back into Python code,
9489 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009490 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009491
Tim Peters05eba1f2004-08-27 21:32:02 +00009492 seqlen = PySequence_Fast_GET_SIZE(fseq);
9493 /* If empty sequence, return u"". */
9494 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009495 Py_DECREF(fseq);
9496 Py_INCREF(unicode_empty);
9497 res = unicode_empty;
9498 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009499 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009500
Tim Peters05eba1f2004-08-27 21:32:02 +00009501 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009502 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009503 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009504 if (seqlen == 1) {
9505 if (PyUnicode_CheckExact(items[0])) {
9506 res = items[0];
9507 Py_INCREF(res);
9508 Py_DECREF(fseq);
9509 return res;
9510 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009511 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009512 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009513 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009514 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009515 /* Set up sep and seplen */
9516 if (separator == NULL) {
9517 /* fall back to a blank space separator */
9518 sep = PyUnicode_FromOrdinal(' ');
9519 if (!sep)
9520 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009521 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009522 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009523 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009524 else {
9525 if (!PyUnicode_Check(separator)) {
9526 PyErr_Format(PyExc_TypeError,
9527 "separator: expected str instance,"
9528 " %.80s found",
9529 Py_TYPE(separator)->tp_name);
9530 goto onError;
9531 }
9532 if (PyUnicode_READY(separator))
9533 goto onError;
9534 sep = separator;
9535 seplen = PyUnicode_GET_LENGTH(separator);
9536 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9537 /* inc refcount to keep this code path symmetric with the
9538 above case of a blank separator */
9539 Py_INCREF(sep);
9540 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009541 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009542 }
9543
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009544 /* There are at least two things to join, or else we have a subclass
9545 * of str in the sequence.
9546 * Do a pre-pass to figure out the total amount of space we'll
9547 * need (sz), and see whether all argument are strings.
9548 */
9549 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009550#ifdef Py_DEBUG
9551 use_memcpy = 0;
9552#else
9553 use_memcpy = 1;
9554#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 for (i = 0; i < seqlen; i++) {
9556 const Py_ssize_t old_sz = sz;
9557 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 if (!PyUnicode_Check(item)) {
9559 PyErr_Format(PyExc_TypeError,
9560 "sequence item %zd: expected str instance,"
9561 " %.80s found",
9562 i, Py_TYPE(item)->tp_name);
9563 goto onError;
9564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 if (PyUnicode_READY(item) == -1)
9566 goto onError;
9567 sz += PyUnicode_GET_LENGTH(item);
9568 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009569 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009570 if (i != 0)
9571 sz += seplen;
9572 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9573 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009575 goto onError;
9576 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009577 if (use_memcpy && last_obj != NULL) {
9578 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9579 use_memcpy = 0;
9580 }
9581 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009582 }
Tim Petersced69f82003-09-16 20:30:58 +00009583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009585 if (res == NULL)
9586 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009587
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009588 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009589#ifdef Py_DEBUG
9590 use_memcpy = 0;
9591#else
9592 if (use_memcpy) {
9593 res_data = PyUnicode_1BYTE_DATA(res);
9594 kind = PyUnicode_KIND(res);
9595 if (seplen != 0)
9596 sep_data = PyUnicode_1BYTE_DATA(sep);
9597 }
9598#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009600 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009601 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009603 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 if (use_memcpy) {
9605 Py_MEMCPY(res_data,
9606 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009607 kind * seplen);
9608 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009609 }
9610 else {
9611 copy_characters(res, res_offset, sep, 0, seplen);
9612 res_offset += seplen;
9613 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009615 itemlen = PyUnicode_GET_LENGTH(item);
9616 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 if (use_memcpy) {
9618 Py_MEMCPY(res_data,
9619 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009620 kind * itemlen);
9621 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009622 }
9623 else {
9624 copy_characters(res, res_offset, item, 0, itemlen);
9625 res_offset += itemlen;
9626 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009627 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009628 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009629 if (use_memcpy)
9630 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009631 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009632 else
9633 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009634
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009637 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009643 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 return NULL;
9645}
9646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647#define FILL(kind, data, value, start, length) \
9648 do { \
9649 Py_ssize_t i_ = 0; \
9650 assert(kind != PyUnicode_WCHAR_KIND); \
9651 switch ((kind)) { \
9652 case PyUnicode_1BYTE_KIND: { \
9653 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9654 memset(to_, (unsigned char)value, length); \
9655 break; \
9656 } \
9657 case PyUnicode_2BYTE_KIND: { \
9658 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9659 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9660 break; \
9661 } \
9662 default: { \
9663 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9664 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9665 break; \
9666 } \
9667 } \
9668 } while (0)
9669
Victor Stinner9310abb2011-10-05 00:59:23 +02009670static PyObject *
9671pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009672 Py_ssize_t left,
9673 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 PyObject *u;
9677 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009678 int kind;
9679 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680
9681 if (left < 0)
9682 left = 0;
9683 if (right < 0)
9684 right = 0;
9685
Victor Stinnerc4b49542011-12-11 22:44:26 +01009686 if (left == 0 && right == 0)
9687 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9690 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009691 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9692 return NULL;
9693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9695 if (fill > maxchar)
9696 maxchar = fill;
9697 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009698 if (!u)
9699 return NULL;
9700
9701 kind = PyUnicode_KIND(u);
9702 data = PyUnicode_DATA(u);
9703 if (left)
9704 FILL(kind, data, fill, 0, left);
9705 if (right)
9706 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009707 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009708 assert(_PyUnicode_CheckConsistency(u, 1));
9709 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712
Alexander Belopolsky40018472011-02-26 01:02:56 +00009713PyObject *
9714PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717
9718 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 switch(PyUnicode_KIND(string)) {
9723 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009724 if (PyUnicode_IS_ASCII(string))
9725 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009726 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009727 PyUnicode_GET_LENGTH(string), keepends);
9728 else
9729 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009730 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009731 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 break;
9733 case PyUnicode_2BYTE_KIND:
9734 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009735 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 PyUnicode_GET_LENGTH(string), keepends);
9737 break;
9738 case PyUnicode_4BYTE_KIND:
9739 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009740 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 PyUnicode_GET_LENGTH(string), keepends);
9742 break;
9743 default:
9744 assert(0);
9745 list = 0;
9746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747 Py_DECREF(string);
9748 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749}
9750
Alexander Belopolsky40018472011-02-26 01:02:56 +00009751static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009752split(PyObject *self,
9753 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009754 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 int kind1, kind2, kind;
9757 void *buf1, *buf2;
9758 Py_ssize_t len1, len2;
9759 PyObject* out;
9760
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009762 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 if (PyUnicode_READY(self) == -1)
9765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 if (substring == NULL)
9768 switch(PyUnicode_KIND(self)) {
9769 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 if (PyUnicode_IS_ASCII(self))
9771 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009772 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009773 PyUnicode_GET_LENGTH(self), maxcount
9774 );
9775 else
9776 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009777 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009778 PyUnicode_GET_LENGTH(self), maxcount
9779 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 case PyUnicode_2BYTE_KIND:
9781 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009782 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 PyUnicode_GET_LENGTH(self), maxcount
9784 );
9785 case PyUnicode_4BYTE_KIND:
9786 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009787 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 PyUnicode_GET_LENGTH(self), maxcount
9789 );
9790 default:
9791 assert(0);
9792 return NULL;
9793 }
9794
9795 if (PyUnicode_READY(substring) == -1)
9796 return NULL;
9797
9798 kind1 = PyUnicode_KIND(self);
9799 kind2 = PyUnicode_KIND(substring);
9800 kind = kind1 > kind2 ? kind1 : kind2;
9801 buf1 = PyUnicode_DATA(self);
9802 buf2 = PyUnicode_DATA(substring);
9803 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009804 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 if (!buf1)
9806 return NULL;
9807 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009808 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 if (!buf2) {
9810 if (kind1 != kind) PyMem_Free(buf1);
9811 return NULL;
9812 }
9813 len1 = PyUnicode_GET_LENGTH(self);
9814 len2 = PyUnicode_GET_LENGTH(substring);
9815
9816 switch(kind) {
9817 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9819 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 else
9822 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009823 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 break;
9825 case PyUnicode_2BYTE_KIND:
9826 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009827 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 break;
9829 case PyUnicode_4BYTE_KIND:
9830 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009831 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 break;
9833 default:
9834 out = NULL;
9835 }
9836 if (kind1 != kind)
9837 PyMem_Free(buf1);
9838 if (kind2 != kind)
9839 PyMem_Free(buf2);
9840 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841}
9842
Alexander Belopolsky40018472011-02-26 01:02:56 +00009843static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009844rsplit(PyObject *self,
9845 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009846 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 int kind1, kind2, kind;
9849 void *buf1, *buf2;
9850 Py_ssize_t len1, len2;
9851 PyObject* out;
9852
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009853 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009854 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 if (PyUnicode_READY(self) == -1)
9857 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 if (substring == NULL)
9860 switch(PyUnicode_KIND(self)) {
9861 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009862 if (PyUnicode_IS_ASCII(self))
9863 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009864 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009865 PyUnicode_GET_LENGTH(self), maxcount
9866 );
9867 else
9868 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870 PyUnicode_GET_LENGTH(self), maxcount
9871 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 case PyUnicode_2BYTE_KIND:
9873 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009874 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 PyUnicode_GET_LENGTH(self), maxcount
9876 );
9877 case PyUnicode_4BYTE_KIND:
9878 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 PyUnicode_GET_LENGTH(self), maxcount
9881 );
9882 default:
9883 assert(0);
9884 return NULL;
9885 }
9886
9887 if (PyUnicode_READY(substring) == -1)
9888 return NULL;
9889
9890 kind1 = PyUnicode_KIND(self);
9891 kind2 = PyUnicode_KIND(substring);
9892 kind = kind1 > kind2 ? kind1 : kind2;
9893 buf1 = PyUnicode_DATA(self);
9894 buf2 = PyUnicode_DATA(substring);
9895 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009896 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (!buf1)
9898 return NULL;
9899 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009900 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (!buf2) {
9902 if (kind1 != kind) PyMem_Free(buf1);
9903 return NULL;
9904 }
9905 len1 = PyUnicode_GET_LENGTH(self);
9906 len2 = PyUnicode_GET_LENGTH(substring);
9907
9908 switch(kind) {
9909 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9911 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 else
9914 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 break;
9917 case PyUnicode_2BYTE_KIND:
9918 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009919 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 break;
9921 case PyUnicode_4BYTE_KIND:
9922 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009923 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 break;
9925 default:
9926 out = NULL;
9927 }
9928 if (kind1 != kind)
9929 PyMem_Free(buf1);
9930 if (kind2 != kind)
9931 PyMem_Free(buf2);
9932 return out;
9933}
9934
9935static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009936anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9937 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938{
9939 switch(kind) {
9940 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009941 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9942 return asciilib_find(buf1, len1, buf2, len2, offset);
9943 else
9944 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 case PyUnicode_2BYTE_KIND:
9946 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9947 case PyUnicode_4BYTE_KIND:
9948 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9949 }
9950 assert(0);
9951 return -1;
9952}
9953
9954static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009955anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9956 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957{
9958 switch(kind) {
9959 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009960 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9961 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9962 else
9963 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 case PyUnicode_2BYTE_KIND:
9965 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9966 case PyUnicode_4BYTE_KIND:
9967 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9968 }
9969 assert(0);
9970 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009971}
9972
Alexander Belopolsky40018472011-02-26 01:02:56 +00009973static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974replace(PyObject *self, PyObject *str1,
9975 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 PyObject *u;
9978 char *sbuf = PyUnicode_DATA(self);
9979 char *buf1 = PyUnicode_DATA(str1);
9980 char *buf2 = PyUnicode_DATA(str2);
9981 int srelease = 0, release1 = 0, release2 = 0;
9982 int skind = PyUnicode_KIND(self);
9983 int kind1 = PyUnicode_KIND(str1);
9984 int kind2 = PyUnicode_KIND(str2);
9985 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9986 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9987 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009988 int mayshrink;
9989 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990
9991 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009992 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009994 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995
Victor Stinner59de0ee2011-10-07 10:01:28 +02009996 if (str1 == str2)
9997 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (skind < kind1)
9999 /* substring too wide to be present */
10000 goto nothing;
10001
Victor Stinner49a0a212011-10-12 23:46:10 +020010002 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10003 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10004 /* Replacing str1 with str2 may cause a maxchar reduction in the
10005 result string. */
10006 mayshrink = (maxchar_str2 < maxchar);
10007 maxchar = Py_MAX(maxchar, maxchar_str2);
10008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010010 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010011 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010013 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010015 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010016 Py_UCS4 u1, u2;
10017 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010019 if (findchar(sbuf, PyUnicode_KIND(self),
10020 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010021 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010024 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010026 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 rkind = PyUnicode_KIND(u);
10028 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10029 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010030 if (--maxcount < 0)
10031 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010033 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010034 }
10035 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 int rkind = skind;
10037 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 if (kind1 < rkind) {
10040 /* widen substring */
10041 buf1 = _PyUnicode_AsKind(str1, rkind);
10042 if (!buf1) goto error;
10043 release1 = 1;
10044 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010046 if (i < 0)
10047 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 if (rkind > kind2) {
10049 /* widen replacement */
10050 buf2 = _PyUnicode_AsKind(str2, rkind);
10051 if (!buf2) goto error;
10052 release2 = 1;
10053 }
10054 else if (rkind < kind2) {
10055 /* widen self and buf1 */
10056 rkind = kind2;
10057 if (release1) PyMem_Free(buf1);
10058 sbuf = _PyUnicode_AsKind(self, rkind);
10059 if (!sbuf) goto error;
10060 srelease = 1;
10061 buf1 = _PyUnicode_AsKind(str1, rkind);
10062 if (!buf1) goto error;
10063 release1 = 1;
10064 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010065 u = PyUnicode_New(slen, maxchar);
10066 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010068 assert(PyUnicode_KIND(u) == rkind);
10069 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010070
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010071 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010072 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010073 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010075 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010077
10078 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010079 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010080 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010081 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010082 if (i == -1)
10083 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010084 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010090 }
10091 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 Py_ssize_t n, i, j, ires;
10093 Py_ssize_t product, new_size;
10094 int rkind = skind;
10095 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010098 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 buf1 = _PyUnicode_AsKind(str1, rkind);
10100 if (!buf1) goto error;
10101 release1 = 1;
10102 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010103 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010104 if (n == 0)
10105 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010107 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 buf2 = _PyUnicode_AsKind(str2, rkind);
10109 if (!buf2) goto error;
10110 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010113 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 rkind = kind2;
10115 sbuf = _PyUnicode_AsKind(self, rkind);
10116 if (!sbuf) goto error;
10117 srelease = 1;
10118 if (release1) PyMem_Free(buf1);
10119 buf1 = _PyUnicode_AsKind(str1, rkind);
10120 if (!buf1) goto error;
10121 release1 = 1;
10122 }
10123 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10124 PyUnicode_GET_LENGTH(str1))); */
10125 product = n * (len2-len1);
10126 if ((product / (len2-len1)) != n) {
10127 PyErr_SetString(PyExc_OverflowError,
10128 "replace string is too long");
10129 goto error;
10130 }
10131 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010132 if (new_size == 0) {
10133 Py_INCREF(unicode_empty);
10134 u = unicode_empty;
10135 goto done;
10136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10138 PyErr_SetString(PyExc_OverflowError,
10139 "replace string is too long");
10140 goto error;
10141 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010142 u = PyUnicode_New(new_size, maxchar);
10143 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 assert(PyUnicode_KIND(u) == rkind);
10146 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 ires = i = 0;
10148 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010149 while (n-- > 0) {
10150 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010151 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010152 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010153 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010154 if (j == -1)
10155 break;
10156 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010158 memcpy(res + rkind * ires,
10159 sbuf + rkind * i,
10160 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 }
10163 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010165 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010167 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010174 memcpy(res + rkind * ires,
10175 sbuf + rkind * i,
10176 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010177 }
10178 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 /* interleave */
10180 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010181 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010183 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 if (--n <= 0)
10186 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010187 memcpy(res + rkind * ires,
10188 sbuf + rkind * i,
10189 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 ires++;
10191 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010192 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 memcpy(res + rkind * ires,
10194 sbuf + rkind * i,
10195 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010196 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 }
10198
10199 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010200 unicode_adjust_maxchar(&u);
10201 if (u == NULL)
10202 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010204
10205 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 if (srelease)
10207 PyMem_FREE(sbuf);
10208 if (release1)
10209 PyMem_FREE(buf1);
10210 if (release2)
10211 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010212 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010214
Benjamin Peterson29060642009-01-31 22:14:21 +000010215 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010216 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (srelease)
10218 PyMem_FREE(sbuf);
10219 if (release1)
10220 PyMem_FREE(buf1);
10221 if (release2)
10222 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010223 return unicode_result_unchanged(self);
10224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 error:
10226 if (srelease && sbuf)
10227 PyMem_FREE(sbuf);
10228 if (release1 && buf1)
10229 PyMem_FREE(buf1);
10230 if (release2 && buf2)
10231 PyMem_FREE(buf2);
10232 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233}
10234
10235/* --- Unicode Object Methods --------------------------------------------- */
10236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010237PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010238 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239\n\
10240Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010241characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
10243static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010244unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 return fixup(self, fixtitle);
10247}
10248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010249PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010250 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251\n\
10252Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010253have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254
10255static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010256unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258 return fixup(self, fixcapitalize);
10259}
10260
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of words,
   replace every list item by its fixup(..., fixcapitalize) result, then join
   the list again.  Returns NULL if the split or any capitalisation fails;
   the temporary list is released on every path once it exists. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, pos),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10298
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010299/* Argument converter. Coerces to a single unicode character */
10300
10301static int
10302convert_uc(PyObject *obj, void *addr)
10303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010305 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010306
Benjamin Peterson14339b62009-01-31 16:36:08 +000010307 uniobj = PyUnicode_FromObject(obj);
10308 if (uniobj == NULL) {
10309 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010310 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010311 return 0;
10312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010314 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010316 Py_DECREF(uniobj);
10317 return 0;
10318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010320 Py_DECREF(uniobj);
10321 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010322}
10323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010324PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010327Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010328done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
10330static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010331unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010333 Py_ssize_t marg, left;
10334 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 Py_UCS4 fillchar = ' ';
10336
Victor Stinnere9a29352011-10-01 02:14:59 +020010337 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339
Victor Stinnerc4b49542011-12-11 22:44:26 +010010340 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341 return NULL;
10342
Victor Stinnerc4b49542011-12-11 22:44:26 +010010343 if (PyUnicode_GET_LENGTH(self) >= width)
10344 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345
Victor Stinnerc4b49542011-12-11 22:44:26 +010010346 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 left = marg / 2 + (marg & width & 1);
10348
Victor Stinner9310abb2011-10-05 00:59:23 +020010349 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350}
10351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352/* This function assumes that str1 and str2 are readied by the caller. */
10353
Marc-André Lemburge5034372000-08-08 08:04:29 +000010354static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010355unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 int kind1, kind2;
10358 void *data1, *data2;
10359 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 kind1 = PyUnicode_KIND(str1);
10362 kind2 = PyUnicode_KIND(str2);
10363 data1 = PyUnicode_DATA(str1);
10364 data2 = PyUnicode_DATA(str2);
10365 len1 = PyUnicode_GET_LENGTH(str1);
10366 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 for (i = 0; i < len1 && i < len2; ++i) {
10369 Py_UCS4 c1, c2;
10370 c1 = PyUnicode_READ(kind1, data1, i);
10371 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010372
10373 if (c1 != c2)
10374 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010375 }
10376
10377 return (len1 < len2) ? -1 : (len1 != len2);
10378}
10379
Alexander Belopolsky40018472011-02-26 01:02:56 +000010380int
10381PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10384 if (PyUnicode_READY(left) == -1 ||
10385 PyUnicode_READY(right) == -1)
10386 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010387 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010389 PyErr_Format(PyExc_TypeError,
10390 "Can't compare %.100s and %.100s",
10391 left->ob_type->tp_name,
10392 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 return -1;
10394}
10395
Martin v. Löwis5b222132007-06-10 09:51:05 +000010396int
10397PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10398{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 Py_ssize_t i;
10400 int kind;
10401 void *data;
10402 Py_UCS4 chr;
10403
Victor Stinner910337b2011-10-03 03:20:16 +020010404 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 if (PyUnicode_READY(uni) == -1)
10406 return -1;
10407 kind = PyUnicode_KIND(uni);
10408 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010409 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10411 if (chr != str[i])
10412 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010413 /* This check keeps Python strings that end in '\0' from comparing equal
10414 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010417 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010419 return 0;
10420}
10421
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010422
Benjamin Peterson29060642009-01-31 22:14:21 +000010423#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010424 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010425
Alexander Belopolsky40018472011-02-26 01:02:56 +000010426PyObject *
10427PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010428{
10429 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010430
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010431 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10432 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 if (PyUnicode_READY(left) == -1 ||
10434 PyUnicode_READY(right) == -1)
10435 return NULL;
10436 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10437 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010438 if (op == Py_EQ) {
10439 Py_INCREF(Py_False);
10440 return Py_False;
10441 }
10442 if (op == Py_NE) {
10443 Py_INCREF(Py_True);
10444 return Py_True;
10445 }
10446 }
10447 if (left == right)
10448 result = 0;
10449 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010450 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010452 /* Convert the return value to a Boolean */
10453 switch (op) {
10454 case Py_EQ:
10455 v = TEST_COND(result == 0);
10456 break;
10457 case Py_NE:
10458 v = TEST_COND(result != 0);
10459 break;
10460 case Py_LE:
10461 v = TEST_COND(result <= 0);
10462 break;
10463 case Py_GE:
10464 v = TEST_COND(result >= 0);
10465 break;
10466 case Py_LT:
10467 v = TEST_COND(result == -1);
10468 break;
10469 case Py_GT:
10470 v = TEST_COND(result == 1);
10471 break;
10472 default:
10473 PyErr_BadArgument();
10474 return NULL;
10475 }
10476 Py_INCREF(v);
10477 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010478 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010479
Brian Curtindfc80e32011-08-10 20:28:54 -050010480 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010481}
10482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483int
10484PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010485{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 int kind1, kind2, kind;
10488 void *buf1, *buf2;
10489 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010490 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010491
10492 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493 sub = PyUnicode_FromObject(element);
10494 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 PyErr_Format(PyExc_TypeError,
10496 "'in <string>' requires string as left operand, not %s",
10497 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (PyUnicode_READY(sub) == -1)
10501 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010502
Thomas Wouters477c8d52006-05-27 19:21:47 +000010503 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010504 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 Py_DECREF(sub);
10506 return -1;
10507 }
10508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 kind1 = PyUnicode_KIND(str);
10510 kind2 = PyUnicode_KIND(sub);
10511 kind = kind1 > kind2 ? kind1 : kind2;
10512 buf1 = PyUnicode_DATA(str);
10513 buf2 = PyUnicode_DATA(sub);
10514 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010515 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (!buf1) {
10517 Py_DECREF(sub);
10518 return -1;
10519 }
10520 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (!buf2) {
10523 Py_DECREF(sub);
10524 if (kind1 != kind) PyMem_Free(buf1);
10525 return -1;
10526 }
10527 len1 = PyUnicode_GET_LENGTH(str);
10528 len2 = PyUnicode_GET_LENGTH(sub);
10529
10530 switch(kind) {
10531 case PyUnicode_1BYTE_KIND:
10532 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10533 break;
10534 case PyUnicode_2BYTE_KIND:
10535 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10536 break;
10537 case PyUnicode_4BYTE_KIND:
10538 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10539 break;
10540 default:
10541 result = -1;
10542 assert(0);
10543 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544
10545 Py_DECREF(str);
10546 Py_DECREF(sub);
10547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (kind1 != kind)
10549 PyMem_Free(buf1);
10550 if (kind2 != kind)
10551 PyMem_Free(buf2);
10552
Guido van Rossum403d68b2000-03-13 15:55:09 +000010553 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010554}
10555
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556/* Concat to string or Unicode object giving a new Unicode object. */
10557
Alexander Belopolsky40018472011-02-26 01:02:56 +000010558PyObject *
10559PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010562 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010563 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564
10565 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
10573 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010574 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010578 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581 }
10582
Victor Stinner488fa492011-12-12 00:01:39 +010010583 u_len = PyUnicode_GET_LENGTH(u);
10584 v_len = PyUnicode_GET_LENGTH(v);
10585 if (u_len > PY_SSIZE_T_MAX - v_len) {
10586 PyErr_SetString(PyExc_OverflowError,
10587 "strings are too large to concat");
10588 goto onError;
10589 }
10590 new_len = u_len + v_len;
10591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010593 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10594 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010597 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010600 copy_characters(w, 0, u, 0, u_len);
10601 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602 Py_DECREF(u);
10603 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010604 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606
Benjamin Peterson29060642009-01-31 22:14:21 +000010607 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 Py_XDECREF(u);
10609 Py_XDECREF(v);
10610 return NULL;
10611}
10612
Walter Dörwald1ab83302007-05-18 17:15:44 +000010613void
Victor Stinner23e56682011-10-03 03:54:37 +020010614PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010615{
Victor Stinner23e56682011-10-03 03:54:37 +020010616 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010617 Py_UCS4 maxchar, maxchar2;
10618 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010619
10620 if (p_left == NULL) {
10621 if (!PyErr_Occurred())
10622 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010623 return;
10624 }
Victor Stinner23e56682011-10-03 03:54:37 +020010625 left = *p_left;
10626 if (right == NULL || !PyUnicode_Check(left)) {
10627 if (!PyErr_Occurred())
10628 PyErr_BadInternalCall();
10629 goto error;
10630 }
10631
Victor Stinnere1335c72011-10-04 20:53:03 +020010632 if (PyUnicode_READY(left))
10633 goto error;
10634 if (PyUnicode_READY(right))
10635 goto error;
10636
Victor Stinner488fa492011-12-12 00:01:39 +010010637 /* Shortcuts */
10638 if (left == unicode_empty) {
10639 Py_DECREF(left);
10640 Py_INCREF(right);
10641 *p_left = right;
10642 return;
10643 }
10644 if (right == unicode_empty)
10645 return;
10646
10647 left_len = PyUnicode_GET_LENGTH(left);
10648 right_len = PyUnicode_GET_LENGTH(right);
10649 if (left_len > PY_SSIZE_T_MAX - right_len) {
10650 PyErr_SetString(PyExc_OverflowError,
10651 "strings are too large to concat");
10652 goto error;
10653 }
10654 new_len = left_len + right_len;
10655
10656 if (unicode_modifiable(left)
10657 && PyUnicode_CheckExact(right)
10658 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010659 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10660 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010661 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010662 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010663 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10664 {
10665 /* append inplace */
10666 if (unicode_resize(p_left, new_len) != 0) {
10667 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10668 * deallocated so it cannot be put back into
10669 * 'variable'. The MemoryError is raised when there
10670 * is no value in 'variable', which might (very
10671 * remotely) be a cause of incompatibilities.
10672 */
10673 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010674 }
Victor Stinner488fa492011-12-12 00:01:39 +010010675 /* copy 'right' into the newly allocated area of 'left' */
10676 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010677 }
Victor Stinner488fa492011-12-12 00:01:39 +010010678 else {
10679 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10680 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10681 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010682
Victor Stinner488fa492011-12-12 00:01:39 +010010683 /* Concat the two Unicode strings */
10684 res = PyUnicode_New(new_len, maxchar);
10685 if (res == NULL)
10686 goto error;
10687 copy_characters(res, 0, left, 0, left_len);
10688 copy_characters(res, left_len, right, 0, right_len);
10689 Py_DECREF(left);
10690 *p_left = res;
10691 }
10692 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010693 return;
10694
10695error:
Victor Stinner488fa492011-12-12 00:01:39 +010010696 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010697}
10698
10699void
10700PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10701{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010702 PyUnicode_Append(pleft, right);
10703 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010704}
10705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010706PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010707 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010709Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010710string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010711interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
10713static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010714unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010716 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010717 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010718 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 int kind1, kind2, kind;
10721 void *buf1, *buf2;
10722 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723
Jesus Ceaac451502011-04-20 17:09:23 +020010724 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10725 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 kind1 = PyUnicode_KIND(self);
10729 kind2 = PyUnicode_KIND(substring);
10730 kind = kind1 > kind2 ? kind1 : kind2;
10731 buf1 = PyUnicode_DATA(self);
10732 buf2 = PyUnicode_DATA(substring);
10733 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010734 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (!buf1) {
10736 Py_DECREF(substring);
10737 return NULL;
10738 }
10739 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010740 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if (!buf2) {
10742 Py_DECREF(substring);
10743 if (kind1 != kind) PyMem_Free(buf1);
10744 return NULL;
10745 }
10746 len1 = PyUnicode_GET_LENGTH(self);
10747 len2 = PyUnicode_GET_LENGTH(substring);
10748
10749 ADJUST_INDICES(start, end, len1);
10750 switch(kind) {
10751 case PyUnicode_1BYTE_KIND:
10752 iresult = ucs1lib_count(
10753 ((Py_UCS1*)buf1) + start, end - start,
10754 buf2, len2, PY_SSIZE_T_MAX
10755 );
10756 break;
10757 case PyUnicode_2BYTE_KIND:
10758 iresult = ucs2lib_count(
10759 ((Py_UCS2*)buf1) + start, end - start,
10760 buf2, len2, PY_SSIZE_T_MAX
10761 );
10762 break;
10763 case PyUnicode_4BYTE_KIND:
10764 iresult = ucs4lib_count(
10765 ((Py_UCS4*)buf1) + start, end - start,
10766 buf2, len2, PY_SSIZE_T_MAX
10767 );
10768 break;
10769 default:
10770 assert(0); iresult = 0;
10771 }
10772
10773 result = PyLong_FromSsize_t(iresult);
10774
10775 if (kind1 != kind)
10776 PyMem_Free(buf1);
10777 if (kind2 != kind)
10778 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
10780 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010781
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 return result;
10783}
10784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010785PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010786 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010788Encode S using the codec registered for encoding. Default encoding\n\
10789is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010790handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010791a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10792'xmlcharrefreplace' as well as any other name registered with\n\
10793codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794
10795static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010796unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010798 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 char *encoding = NULL;
10800 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010801
Benjamin Peterson308d6372009-09-18 21:42:35 +000010802 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10803 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010805 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010806}
10807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010808PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810\n\
10811Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010812If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813
10814static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010815unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010817 Py_ssize_t i, j, line_pos, src_len, incr;
10818 Py_UCS4 ch;
10819 PyObject *u;
10820 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010822 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010823 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
10825 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
Antoine Pitrou22425222011-10-04 19:10:51 +020010828 if (PyUnicode_READY(self) == -1)
10829 return NULL;
10830
Thomas Wouters7e474022000-07-16 12:04:32 +000010831 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010832 src_len = PyUnicode_GET_LENGTH(self);
10833 i = j = line_pos = 0;
10834 kind = PyUnicode_KIND(self);
10835 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010836 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010837 for (; i < src_len; i++) {
10838 ch = PyUnicode_READ(kind, src_data, i);
10839 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010840 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010842 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010843 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010844 goto overflow;
10845 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010847 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010851 goto overflow;
10852 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010854 if (ch == '\n' || ch == '\r')
10855 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010858 if (!found)
10859 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010860
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 if (!u)
10864 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010865 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
Antoine Pitroue71d5742011-10-04 15:55:09 +020010867 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Antoine Pitroue71d5742011-10-04 15:55:09 +020010869 for (; i < src_len; i++) {
10870 ch = PyUnicode_READ(kind, src_data, i);
10871 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010873 incr = tabsize - (line_pos % tabsize);
10874 line_pos += incr;
10875 while (incr--) {
10876 PyUnicode_WRITE(kind, dest_data, j, ' ');
10877 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010878 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010880 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 line_pos++;
10883 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010884 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 if (ch == '\n' || ch == '\r')
10886 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010888 }
10889 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010890 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010891
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010893 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895}
10896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010897PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010898 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899\n\
10900Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010901such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902arguments start and end are interpreted as in slice notation.\n\
10903\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010904Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905
10906static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010909 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010910 Py_ssize_t start;
10911 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010912 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
Jesus Ceaac451502011-04-20 17:09:23 +020010914 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10915 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (PyUnicode_READY(self) == -1)
10919 return NULL;
10920 if (PyUnicode_READY(substring) == -1)
10921 return NULL;
10922
Victor Stinner7931d9a2011-11-04 00:22:48 +010010923 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
10925 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (result == -2)
10928 return NULL;
10929
Christian Heimes217cfd12007-12-02 14:31:20 +000010930 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931}
10932
10933static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010934unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010936 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10937 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940}
10941
Guido van Rossumc2504932007-09-18 19:42:40 +000010942/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010943 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010944static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010945unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Guido van Rossumc2504932007-09-18 19:42:40 +000010947 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010948 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (_PyUnicode_HASH(self) != -1)
10951 return _PyUnicode_HASH(self);
10952 if (PyUnicode_READY(self) == -1)
10953 return -1;
10954 len = PyUnicode_GET_LENGTH(self);
10955
10956 /* The hash function as a macro, gets expanded three times below. */
10957#define HASH(P) \
10958 x = (Py_uhash_t)*P << 7; \
10959 while (--len >= 0) \
10960 x = (1000003*x) ^ (Py_uhash_t)*P++;
10961
10962 switch (PyUnicode_KIND(self)) {
10963 case PyUnicode_1BYTE_KIND: {
10964 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10965 HASH(c);
10966 break;
10967 }
10968 case PyUnicode_2BYTE_KIND: {
10969 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10970 HASH(s);
10971 break;
10972 }
10973 default: {
10974 Py_UCS4 *l;
10975 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10976 "Impossible switch case in unicode_hash");
10977 l = PyUnicode_4BYTE_DATA(self);
10978 HASH(l);
10979 break;
10980 }
10981 }
10982 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10983
Guido van Rossumc2504932007-09-18 19:42:40 +000010984 if (x == -1)
10985 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010987 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010991PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010994Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995
10996static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010999 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011000 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011001 Py_ssize_t start;
11002 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
Jesus Ceaac451502011-04-20 17:09:23 +020011004 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11005 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (PyUnicode_READY(self) == -1)
11009 return NULL;
11010 if (PyUnicode_READY(substring) == -1)
11011 return NULL;
11012
Victor Stinner7931d9a2011-11-04 00:22:48 +010011013 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014
11015 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 if (result == -2)
11018 return NULL;
11019
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 if (result < 0) {
11021 PyErr_SetString(PyExc_ValueError, "substring not found");
11022 return NULL;
11023 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011024
Christian Heimes217cfd12007-12-02 14:31:20 +000011025 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026}
11027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011028PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011031Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011032at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
11034static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011035unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 Py_ssize_t i, length;
11038 int kind;
11039 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 int cased;
11041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (PyUnicode_READY(self) == -1)
11043 return NULL;
11044 length = PyUnicode_GET_LENGTH(self);
11045 kind = PyUnicode_KIND(self);
11046 data = PyUnicode_DATA(self);
11047
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 if (length == 1)
11050 return PyBool_FromLong(
11051 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011053 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011056
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 for (i = 0; i < length; i++) {
11059 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011060
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11062 return PyBool_FromLong(0);
11063 else if (!cased && Py_UNICODE_ISLOWER(ch))
11064 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011066 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067}
11068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011069PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011070 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011072Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011073at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074
11075static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011076unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 Py_ssize_t i, length;
11079 int kind;
11080 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081 int cased;
11082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (PyUnicode_READY(self) == -1)
11084 return NULL;
11085 length = PyUnicode_GET_LENGTH(self);
11086 kind = PyUnicode_KIND(self);
11087 data = PyUnicode_DATA(self);
11088
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 if (length == 1)
11091 return PyBool_FromLong(
11092 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011094 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011096 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011097
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 for (i = 0; i < length; i++) {
11100 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011101
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11103 return PyBool_FromLong(0);
11104 else if (!cased && Py_UNICODE_ISUPPER(ch))
11105 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011107 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108}
11109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011110PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011111 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011113Return True if S is a titlecased string and there is at least one\n\
11114character in S, i.e. upper- and titlecase characters may only\n\
11115follow uncased characters and lowercase characters only cased ones.\n\
11116Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117
11118static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011119unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 Py_ssize_t i, length;
11122 int kind;
11123 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124 int cased, previous_is_cased;
11125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 if (PyUnicode_READY(self) == -1)
11127 return NULL;
11128 length = PyUnicode_GET_LENGTH(self);
11129 kind = PyUnicode_KIND(self);
11130 data = PyUnicode_DATA(self);
11131
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 if (length == 1) {
11134 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11135 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11136 (Py_UNICODE_ISUPPER(ch) != 0));
11137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011139 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011142
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 cased = 0;
11144 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 for (i = 0; i < length; i++) {
11146 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011147
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11149 if (previous_is_cased)
11150 return PyBool_FromLong(0);
11151 previous_is_cased = 1;
11152 cased = 1;
11153 }
11154 else if (Py_UNICODE_ISLOWER(ch)) {
11155 if (!previous_is_cased)
11156 return PyBool_FromLong(0);
11157 previous_is_cased = 1;
11158 cased = 1;
11159 }
11160 else
11161 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011163 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011169Return True if all characters in S are whitespace\n\
11170and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011173unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 Py_ssize_t i, length;
11176 int kind;
11177 void *data;
11178
11179 if (PyUnicode_READY(self) == -1)
11180 return NULL;
11181 length = PyUnicode_GET_LENGTH(self);
11182 kind = PyUnicode_KIND(self);
11183 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (length == 1)
11187 return PyBool_FromLong(
11188 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011190 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 for (i = 0; i < length; i++) {
11195 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011196 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011197 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011199 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200}
11201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011202PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011203 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011204\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011205Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011206and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207
11208static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011209unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 Py_ssize_t i, length;
11212 int kind;
11213 void *data;
11214
11215 if (PyUnicode_READY(self) == -1)
11216 return NULL;
11217 length = PyUnicode_GET_LENGTH(self);
11218 kind = PyUnicode_KIND(self);
11219 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011220
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 if (length == 1)
11223 return PyBool_FromLong(
11224 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011225
11226 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 for (i = 0; i < length; i++) {
11231 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011233 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011234 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011235}
11236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011237PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011239\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011240Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011242
11243static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011244unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 int kind;
11247 void *data;
11248 Py_ssize_t len, i;
11249
11250 if (PyUnicode_READY(self) == -1)
11251 return NULL;
11252
11253 kind = PyUnicode_KIND(self);
11254 data = PyUnicode_DATA(self);
11255 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011256
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011257 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 if (len == 1) {
11259 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11260 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11261 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011262
11263 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 for (i = 0; i < len; i++) {
11268 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011269 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011272 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011273}
11274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011275PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011278Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011279False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
11281static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011282unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 Py_ssize_t i, length;
11285 int kind;
11286 void *data;
11287
11288 if (PyUnicode_READY(self) == -1)
11289 return NULL;
11290 length = PyUnicode_GET_LENGTH(self);
11291 kind = PyUnicode_KIND(self);
11292 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (length == 1)
11296 return PyBool_FromLong(
11297 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011299 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 for (i = 0; i < length; i++) {
11304 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011307 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308}
11309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011310PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011313Return True if all characters in S are digits\n\
11314and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
11316static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011317unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 Py_ssize_t i, length;
11320 int kind;
11321 void *data;
11322
11323 if (PyUnicode_READY(self) == -1)
11324 return NULL;
11325 length = PyUnicode_GET_LENGTH(self);
11326 kind = PyUnicode_KIND(self);
11327 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 if (length == 1) {
11331 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11332 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011335 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 for (i = 0; i < length; i++) {
11340 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011343 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011346PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011349Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
11352static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011353unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 Py_ssize_t i, length;
11356 int kind;
11357 void *data;
11358
11359 if (PyUnicode_READY(self) == -1)
11360 return NULL;
11361 length = PyUnicode_GET_LENGTH(self);
11362 kind = PyUnicode_KIND(self);
11363 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 if (length == 1)
11367 return PyBool_FromLong(
11368 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011370 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 for (i = 0; i < length; i++) {
11375 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011378 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379}
11380
Martin v. Löwis47383402007-08-15 07:32:56 +000011381int
11382PyUnicode_IsIdentifier(PyObject *self)
11383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 int kind;
11385 void *data;
11386 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011387 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 if (PyUnicode_READY(self) == -1) {
11390 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 }
11393
11394 /* Special case for empty strings */
11395 if (PyUnicode_GET_LENGTH(self) == 0)
11396 return 0;
11397 kind = PyUnicode_KIND(self);
11398 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011399
11400 /* PEP 3131 says that the first character must be in
11401 XID_Start and subsequent characters in XID_Continue,
11402 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011403 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011404 letters, digits, underscore). However, given the current
11405 definition of XID_Start and XID_Continue, it is sufficient
11406 to check just for these, except that _ must be allowed
11407 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011409 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011410 return 0;
11411
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011412 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011415 return 1;
11416}
11417
11418PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011420\n\
11421Return True if S is a valid identifier according\n\
11422to the language definition.");
11423
11424static PyObject*
11425unicode_isidentifier(PyObject *self)
11426{
11427 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11428}
11429
Georg Brandl559e5d72008-06-11 18:37:52 +000011430PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011432\n\
11433Return True if all characters in S are considered\n\
11434printable in repr() or S is empty, False otherwise.");
11435
11436static PyObject*
11437unicode_isprintable(PyObject *self)
11438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 Py_ssize_t i, length;
11440 int kind;
11441 void *data;
11442
11443 if (PyUnicode_READY(self) == -1)
11444 return NULL;
11445 length = PyUnicode_GET_LENGTH(self);
11446 kind = PyUnicode_KIND(self);
11447 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011448
11449 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (length == 1)
11451 return PyBool_FromLong(
11452 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 for (i = 0; i < length; i++) {
11455 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011456 Py_RETURN_FALSE;
11457 }
11458 }
11459 Py_RETURN_TRUE;
11460}
11461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011463 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464\n\
11465Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011466iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
11468static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011469unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011471 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472}
11473
Martin v. Löwis18e16552006-02-15 17:27:45 +000011474static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011475unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (PyUnicode_READY(self) == -1)
11478 return -1;
11479 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480}
11481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011485Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011486done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
11488static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011489unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011491 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 Py_UCS4 fillchar = ' ';
11493
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011494 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 return NULL;
11496
Victor Stinnerc4b49542011-12-11 22:44:26 +010011497 if (PyUnicode_READY(self) < 0)
11498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
Victor Stinnerc4b49542011-12-11 22:44:26 +010011500 if (PyUnicode_GET_LENGTH(self) >= width)
11501 return unicode_result_unchanged(self);
11502
11503 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504}
11505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011506PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011512unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 return fixup(self, fixlower);
11515}
11516
/* Strip modes; they double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string past its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11525
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011526/* externally visible for str.strip(unicode) */
11527PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011528_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 void *data;
11531 int kind;
11532 Py_ssize_t i, j, len;
11533 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11536 return NULL;
11537
11538 kind = PyUnicode_KIND(self);
11539 data = PyUnicode_DATA(self);
11540 len = PyUnicode_GET_LENGTH(self);
11541 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11542 PyUnicode_DATA(sepobj),
11543 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011544
Benjamin Peterson14339b62009-01-31 16:36:08 +000011545 i = 0;
11546 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 while (i < len &&
11548 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 i++;
11550 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011551 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011552
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553 j = len;
11554 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 do {
11556 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 } while (j >= i &&
11558 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011560 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011561
Victor Stinner7931d9a2011-11-04 00:22:48 +010011562 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563}
11564
11565PyObject*
11566PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11567{
11568 unsigned char *data;
11569 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011570 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571
Victor Stinnerde636f32011-10-01 03:55:54 +020011572 if (PyUnicode_READY(self) == -1)
11573 return NULL;
11574
11575 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11576
Victor Stinner12bab6d2011-10-01 01:53:49 +020011577 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011578 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579
Victor Stinner12bab6d2011-10-01 01:53:49 +020011580 length = end - start;
11581 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011582 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583
Victor Stinnerde636f32011-10-01 03:55:54 +020011584 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011585 PyErr_SetString(PyExc_IndexError, "string index out of range");
11586 return NULL;
11587 }
11588
Victor Stinnerb9275c12011-10-05 14:01:42 +020011589 if (PyUnicode_IS_ASCII(self)) {
11590 kind = PyUnicode_KIND(self);
11591 data = PyUnicode_1BYTE_DATA(self);
11592 return unicode_fromascii(data + start, length);
11593 }
11594 else {
11595 kind = PyUnicode_KIND(self);
11596 data = PyUnicode_1BYTE_DATA(self);
11597 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011598 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011599 length);
11600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
11603static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011604do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 int kind;
11607 void *data;
11608 Py_ssize_t len, i, j;
11609
11610 if (PyUnicode_READY(self) == -1)
11611 return NULL;
11612
11613 kind = PyUnicode_KIND(self);
11614 data = PyUnicode_DATA(self);
11615 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011616
Benjamin Peterson14339b62009-01-31 16:36:08 +000011617 i = 0;
11618 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011620 i++;
11621 }
11622 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011623
Benjamin Peterson14339b62009-01-31 16:36:08 +000011624 j = len;
11625 if (striptype != LEFTSTRIP) {
11626 do {
11627 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011629 j++;
11630 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011631
Victor Stinner7931d9a2011-11-04 00:22:48 +010011632 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633}
11634
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011635
11636static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011637do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011639 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011640
Benjamin Peterson14339b62009-01-31 16:36:08 +000011641 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11642 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643
Benjamin Peterson14339b62009-01-31 16:36:08 +000011644 if (sep != NULL && sep != Py_None) {
11645 if (PyUnicode_Check(sep))
11646 return _PyUnicode_XStrip(self, striptype, sep);
11647 else {
11648 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "%s arg must be None or str",
11650 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 return NULL;
11652 }
11653 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011654
Benjamin Peterson14339b62009-01-31 16:36:08 +000011655 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011656}
11657
11658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011659PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011661\n\
11662Return a copy of the string S with leading and trailing\n\
11663whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011664If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011665
11666static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011667unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011668{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011669 if (PyTuple_GET_SIZE(args) == 0)
11670 return do_strip(self, BOTHSTRIP); /* Common case */
11671 else
11672 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011673}
11674
11675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678\n\
11679Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011680If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011681
11682static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011683unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011684{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 if (PyTuple_GET_SIZE(args) == 0)
11686 return do_strip(self, LEFTSTRIP); /* Common case */
11687 else
11688 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011689}
11690
11691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011692PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694\n\
11695Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011696If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697
11698static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011699unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011700{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011701 if (PyTuple_GET_SIZE(args) == 0)
11702 return do_strip(self, RIGHTSTRIP); /* Common case */
11703 else
11704 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011705}
11706
11707
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011709unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011711 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
Georg Brandl222de0f2009-04-12 12:01:50 +000011714 if (len < 1) {
11715 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011716 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718
Victor Stinnerc4b49542011-12-11 22:44:26 +010011719 /* no repeat, return original string */
11720 if (len == 1)
11721 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011722
Victor Stinnerc4b49542011-12-11 22:44:26 +010011723 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 return NULL;
11725
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011726 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011727 PyErr_SetString(PyExc_OverflowError,
11728 "repeated string is too long");
11729 return NULL;
11730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011732
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011733 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 if (!u)
11735 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011736 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 if (PyUnicode_GET_LENGTH(str) == 1) {
11739 const int kind = PyUnicode_KIND(str);
11740 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11741 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011742 if (kind == PyUnicode_1BYTE_KIND)
11743 memset(to, (unsigned char)fill_char, len);
11744 else {
11745 for (n = 0; n < len; ++n)
11746 PyUnicode_WRITE(kind, to, n, fill_char);
11747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 }
11749 else {
11750 /* number of characters copied this far */
11751 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011752 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 char *to = (char *) PyUnicode_DATA(u);
11754 Py_MEMCPY(to, PyUnicode_DATA(str),
11755 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 n = (done <= nchars-done) ? done : nchars-done;
11758 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011759 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
11762
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011763 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011764 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765}
11766
Alexander Belopolsky40018472011-02-26 01:02:56 +000011767PyObject *
11768PyUnicode_Replace(PyObject *obj,
11769 PyObject *subobj,
11770 PyObject *replobj,
11771 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772{
11773 PyObject *self;
11774 PyObject *str1;
11775 PyObject *str2;
11776 PyObject *result;
11777
11778 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011779 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011782 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 Py_DECREF(self);
11784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 }
11786 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011787 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 Py_DECREF(self);
11789 Py_DECREF(str1);
11790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 Py_DECREF(self);
11794 Py_DECREF(str1);
11795 Py_DECREF(str2);
11796 return result;
11797}
11798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011799PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011800 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801\n\
11802Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011803old replaced by new. If the optional argument count is\n\
11804given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
11806static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 PyObject *str1;
11810 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011811 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 PyObject *result;
11813
Martin v. Löwis18e16552006-02-15 17:27:45 +000011814 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 str1 = PyUnicode_FromObject(str1);
11819 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11820 return NULL;
11821 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011822 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 Py_DECREF(str1);
11824 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826
11827 result = replace(self, str1, str2, maxcount);
11828
11829 Py_DECREF(str1);
11830 Py_DECREF(str2);
11831 return result;
11832}
11833
Alexander Belopolsky40018472011-02-26 01:02:56 +000011834static PyObject *
11835unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011837 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 Py_ssize_t isize;
11839 Py_ssize_t osize, squote, dquote, i, o;
11840 Py_UCS4 max, quote;
11841 int ikind, okind;
11842 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011845 return NULL;
11846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 isize = PyUnicode_GET_LENGTH(unicode);
11848 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 /* Compute length of output, quote characters, and
11851 maximum character */
11852 osize = 2; /* quotes */
11853 max = 127;
11854 squote = dquote = 0;
11855 ikind = PyUnicode_KIND(unicode);
11856 for (i = 0; i < isize; i++) {
11857 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11858 switch (ch) {
11859 case '\'': squote++; osize++; break;
11860 case '"': dquote++; osize++; break;
11861 case '\\': case '\t': case '\r': case '\n':
11862 osize += 2; break;
11863 default:
11864 /* Fast-path ASCII */
11865 if (ch < ' ' || ch == 0x7f)
11866 osize += 4; /* \xHH */
11867 else if (ch < 0x7f)
11868 osize++;
11869 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11870 osize++;
11871 max = ch > max ? ch : max;
11872 }
11873 else if (ch < 0x100)
11874 osize += 4; /* \xHH */
11875 else if (ch < 0x10000)
11876 osize += 6; /* \uHHHH */
11877 else
11878 osize += 10; /* \uHHHHHHHH */
11879 }
11880 }
11881
11882 quote = '\'';
11883 if (squote) {
11884 if (dquote)
11885 /* Both squote and dquote present. Use squote,
11886 and escape them */
11887 osize += squote;
11888 else
11889 quote = '"';
11890 }
11891
11892 repr = PyUnicode_New(osize, max);
11893 if (repr == NULL)
11894 return NULL;
11895 okind = PyUnicode_KIND(repr);
11896 odata = PyUnicode_DATA(repr);
11897
11898 PyUnicode_WRITE(okind, odata, 0, quote);
11899 PyUnicode_WRITE(okind, odata, osize-1, quote);
11900
11901 for (i = 0, o = 1; i < isize; i++) {
11902 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011903
11904 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 if ((ch == quote) || (ch == '\\')) {
11906 PyUnicode_WRITE(okind, odata, o++, '\\');
11907 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011908 continue;
11909 }
11910
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011912 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 PyUnicode_WRITE(okind, odata, o++, '\\');
11914 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011915 }
11916 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 PyUnicode_WRITE(okind, odata, o++, '\\');
11918 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011919 }
11920 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 PyUnicode_WRITE(okind, odata, o++, '\\');
11922 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011923 }
11924
11925 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011926 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 PyUnicode_WRITE(okind, odata, o++, '\\');
11928 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11930 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011931 }
11932
Georg Brandl559e5d72008-06-11 18:37:52 +000011933 /* Copy ASCII characters as-is */
11934 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011936 }
11937
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011939 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011940 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011941 (categories Z* and C* except ASCII space)
11942 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011944 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 if (ch <= 0xff) {
11946 PyUnicode_WRITE(okind, odata, o++, '\\');
11947 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011948 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11949 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011950 }
11951 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 else if (ch >= 0x10000) {
11953 PyUnicode_WRITE(okind, odata, o++, '\\');
11954 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011955 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11956 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11959 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11960 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11961 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11962 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011963 }
11964 /* Map 16-bit characters to '\uxxxx' */
11965 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 PyUnicode_WRITE(okind, odata, o++, '\\');
11967 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011968 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11969 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11970 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11971 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011972 }
11973 }
11974 /* Copy characters as-is */
11975 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011977 }
11978 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011981 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011982 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011985PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987\n\
11988Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011989such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990arguments start and end are interpreted as in slice notation.\n\
11991\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011992Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993
11994static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011997 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011998 Py_ssize_t start;
11999 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012000 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Jesus Ceaac451502011-04-20 17:09:23 +020012002 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12003 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (PyUnicode_READY(self) == -1)
12007 return NULL;
12008 if (PyUnicode_READY(substring) == -1)
12009 return NULL;
12010
Victor Stinner7931d9a2011-11-04 00:22:48 +010012011 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
12013 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (result == -2)
12016 return NULL;
12017
Christian Heimes217cfd12007-12-02 14:31:20 +000012018 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019}
12020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012021PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012022 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012024Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
12026static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012029 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012030 Py_ssize_t start;
12031 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012032 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Jesus Ceaac451502011-04-20 17:09:23 +020012034 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12035 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 if (PyUnicode_READY(self) == -1)
12039 return NULL;
12040 if (PyUnicode_READY(substring) == -1)
12041 return NULL;
12042
Victor Stinner7931d9a2011-11-04 00:22:48 +010012043 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044
12045 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 if (result == -2)
12048 return NULL;
12049
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 if (result < 0) {
12051 PyErr_SetString(PyExc_ValueError, "substring not found");
12052 return NULL;
12053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054
Christian Heimes217cfd12007-12-02 14:31:20 +000012055 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056}
12057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012058PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012059 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012061Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012062done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063
12064static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012065unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012067 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 Py_UCS4 fillchar = ' ';
12069
Victor Stinnere9a29352011-10-01 02:14:59 +020012070 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012072
Victor Stinnerc4b49542011-12-11 22:44:26 +010012073 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074 return NULL;
12075
Victor Stinnerc4b49542011-12-11 22:44:26 +010012076 if (PyUnicode_GET_LENGTH(self) >= width)
12077 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
Victor Stinnerc4b49542011-12-11 22:44:26 +010012079 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080}
12081
Alexander Belopolsky40018472011-02-26 01:02:56 +000012082PyObject *
12083PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084{
12085 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012086
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 s = PyUnicode_FromObject(s);
12088 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 if (sep != NULL) {
12091 sep = PyUnicode_FromObject(sep);
12092 if (sep == NULL) {
12093 Py_DECREF(s);
12094 return NULL;
12095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 }
12097
Victor Stinner9310abb2011-10-05 00:59:23 +020012098 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
12100 Py_DECREF(s);
12101 Py_XDECREF(sep);
12102 return result;
12103}
12104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012105PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107\n\
12108Return a list of the words in S, using sep as the\n\
12109delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012110splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012111whitespace string is a separator and empty strings are\n\
12112removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113
12114static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012115unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116{
12117 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012118 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119
Martin v. Löwis18e16552006-02-15 17:27:45 +000012120 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 return NULL;
12122
12123 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012124 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012126 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012128 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129}
12130
Thomas Wouters477c8d52006-05-27 19:21:47 +000012131PyObject *
12132PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12133{
12134 PyObject* str_obj;
12135 PyObject* sep_obj;
12136 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 int kind1, kind2, kind;
12138 void *buf1 = NULL, *buf2 = NULL;
12139 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012140
12141 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012142 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012144 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012146 Py_DECREF(str_obj);
12147 return NULL;
12148 }
12149
Victor Stinner14f8f022011-10-05 20:58:25 +020012150 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012152 kind = Py_MAX(kind1, kind2);
12153 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012155 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (!buf1)
12157 goto onError;
12158 buf2 = PyUnicode_DATA(sep_obj);
12159 if (kind2 != kind)
12160 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12161 if (!buf2)
12162 goto onError;
12163 len1 = PyUnicode_GET_LENGTH(str_obj);
12164 len2 = PyUnicode_GET_LENGTH(sep_obj);
12165
Victor Stinner14f8f022011-10-05 20:58:25 +020012166 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012168 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12169 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12170 else
12171 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 break;
12173 case PyUnicode_2BYTE_KIND:
12174 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12175 break;
12176 case PyUnicode_4BYTE_KIND:
12177 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12178 break;
12179 default:
12180 assert(0);
12181 out = 0;
12182 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012183
12184 Py_DECREF(sep_obj);
12185 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 if (kind1 != kind)
12187 PyMem_Free(buf1);
12188 if (kind2 != kind)
12189 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190
12191 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 onError:
12193 Py_DECREF(sep_obj);
12194 Py_DECREF(str_obj);
12195 if (kind1 != kind && buf1)
12196 PyMem_Free(buf1);
12197 if (kind2 != kind && buf2)
12198 PyMem_Free(buf2);
12199 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012200}
12201
12202
12203PyObject *
12204PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12205{
12206 PyObject* str_obj;
12207 PyObject* sep_obj;
12208 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 int kind1, kind2, kind;
12210 void *buf1 = NULL, *buf2 = NULL;
12211 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012212
12213 str_obj = PyUnicode_FromObject(str_in);
12214 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012215 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012216 sep_obj = PyUnicode_FromObject(sep_in);
12217 if (!sep_obj) {
12218 Py_DECREF(str_obj);
12219 return NULL;
12220 }
12221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 kind1 = PyUnicode_KIND(str_in);
12223 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012224 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 buf1 = PyUnicode_DATA(str_in);
12226 if (kind1 != kind)
12227 buf1 = _PyUnicode_AsKind(str_in, kind);
12228 if (!buf1)
12229 goto onError;
12230 buf2 = PyUnicode_DATA(sep_obj);
12231 if (kind2 != kind)
12232 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12233 if (!buf2)
12234 goto onError;
12235 len1 = PyUnicode_GET_LENGTH(str_obj);
12236 len2 = PyUnicode_GET_LENGTH(sep_obj);
12237
12238 switch(PyUnicode_KIND(str_in)) {
12239 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012240 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12241 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12242 else
12243 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 break;
12245 case PyUnicode_2BYTE_KIND:
12246 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12247 break;
12248 case PyUnicode_4BYTE_KIND:
12249 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12250 break;
12251 default:
12252 assert(0);
12253 out = 0;
12254 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255
12256 Py_DECREF(sep_obj);
12257 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 if (kind1 != kind)
12259 PyMem_Free(buf1);
12260 if (kind2 != kind)
12261 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262
12263 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 onError:
12265 Py_DECREF(sep_obj);
12266 Py_DECREF(str_obj);
12267 if (kind1 != kind && buf1)
12268 PyMem_Free(buf1);
12269 if (kind2 != kind && buf2)
12270 PyMem_Free(buf2);
12271 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272}
12273
12274PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012277Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012278the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012279found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012280
12281static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012282unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012283{
Victor Stinner9310abb2011-10-05 00:59:23 +020012284 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012285}
12286
12287PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012288 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012289\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012290Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012292separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012293
12294static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012295unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012296{
Victor Stinner9310abb2011-10-05 00:59:23 +020012297 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298}
12299
Alexander Belopolsky40018472011-02-26 01:02:56 +000012300PyObject *
12301PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012302{
12303 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012304
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012305 s = PyUnicode_FromObject(s);
12306 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 if (sep != NULL) {
12309 sep = PyUnicode_FromObject(sep);
12310 if (sep == NULL) {
12311 Py_DECREF(s);
12312 return NULL;
12313 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012314 }
12315
Victor Stinner9310abb2011-10-05 00:59:23 +020012316 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012317
12318 Py_DECREF(s);
12319 Py_XDECREF(sep);
12320 return result;
12321}
12322
12323PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012325\n\
12326Return a list of the words in S, using sep as the\n\
12327delimiter string, starting at the end of the string and\n\
12328working to the front. If maxsplit is given, at most maxsplit\n\
12329splits are done. If sep is not specified, any whitespace string\n\
12330is a separator.");
12331
12332static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012333unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012334{
12335 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012336 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012337
Martin v. Löwis18e16552006-02-15 17:27:45 +000012338 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012339 return NULL;
12340
12341 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012343 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012344 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012345 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012346 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012347}
12348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012349PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351\n\
12352Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012353Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012354is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
12356static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012357unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012359 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012360 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012362 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12363 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 return NULL;
12365
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012366 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367}
12368
12369static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012370PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012372 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373}
12374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012375PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377\n\
12378Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012379and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
12381static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012382unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 return fixup(self, fixswapcase);
12385}
12386
Georg Brandlceee0772007-11-27 23:48:05 +000012387PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012389\n\
12390Return a translation table usable for str.translate().\n\
12391If there is only one argument, it must be a dictionary mapping Unicode\n\
12392ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012393Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012394If there are two arguments, they must be strings of equal length, and\n\
12395in the resulting dictionary, each character in x will be mapped to the\n\
12396character at the same position in y. If there is a third argument, it\n\
12397must be a string, whose characters will be mapped to None in the result.");
12398
12399static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012400unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012401{
12402 PyObject *x, *y = NULL, *z = NULL;
12403 PyObject *new = NULL, *key, *value;
12404 Py_ssize_t i = 0;
12405 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406
Georg Brandlceee0772007-11-27 23:48:05 +000012407 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12408 return NULL;
12409 new = PyDict_New();
12410 if (!new)
12411 return NULL;
12412 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 int x_kind, y_kind, z_kind;
12414 void *x_data, *y_data, *z_data;
12415
Georg Brandlceee0772007-11-27 23:48:05 +000012416 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012417 if (!PyUnicode_Check(x)) {
12418 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12419 "be a string if there is a second argument");
12420 goto err;
12421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012423 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12424 "arguments must have equal length");
12425 goto err;
12426 }
12427 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 x_kind = PyUnicode_KIND(x);
12429 y_kind = PyUnicode_KIND(y);
12430 x_data = PyUnicode_DATA(x);
12431 y_data = PyUnicode_DATA(y);
12432 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12433 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12434 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012435 if (!key || !value)
12436 goto err;
12437 res = PyDict_SetItem(new, key, value);
12438 Py_DECREF(key);
12439 Py_DECREF(value);
12440 if (res < 0)
12441 goto err;
12442 }
12443 /* create entries for deleting chars in z */
12444 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 z_kind = PyUnicode_KIND(z);
12446 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012447 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012449 if (!key)
12450 goto err;
12451 res = PyDict_SetItem(new, key, Py_None);
12452 Py_DECREF(key);
12453 if (res < 0)
12454 goto err;
12455 }
12456 }
12457 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 int kind;
12459 void *data;
12460
Georg Brandlceee0772007-11-27 23:48:05 +000012461 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012462 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012463 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12464 "to maketrans it must be a dict");
12465 goto err;
12466 }
12467 /* copy entries into the new dict, converting string keys to int keys */
12468 while (PyDict_Next(x, &i, &key, &value)) {
12469 if (PyUnicode_Check(key)) {
12470 /* convert string keys to integer keys */
12471 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012472 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012473 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12474 "table must be of length 1");
12475 goto err;
12476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 kind = PyUnicode_KIND(key);
12478 data = PyUnicode_DATA(key);
12479 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012480 if (!newkey)
12481 goto err;
12482 res = PyDict_SetItem(new, newkey, value);
12483 Py_DECREF(newkey);
12484 if (res < 0)
12485 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012486 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012487 /* just keep integer keys */
12488 if (PyDict_SetItem(new, key, value) < 0)
12489 goto err;
12490 } else {
12491 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12492 "be strings or integers");
12493 goto err;
12494 }
12495 }
12496 }
12497 return new;
12498 err:
12499 Py_DECREF(new);
12500 return NULL;
12501}
12502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012503PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505\n\
12506Return a copy of the string S, where all characters have been mapped\n\
12507through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012508Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012509Unmapped characters are left untouched. Characters mapped to None\n\
12510are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
12512static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516}
12517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012518PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012521Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
12523static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012524unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526 return fixup(self, fixupper);
12527}
12528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012532Pad a numeric string S with zeros on the left, to fill a field\n\
12533of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
12535static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012536unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012538 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012539 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012540 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 int kind;
12542 void *data;
12543 Py_UCS4 chr;
12544
Martin v. Löwis18e16552006-02-15 17:27:45 +000012545 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546 return NULL;
12547
Victor Stinnerc4b49542011-12-11 22:44:26 +010012548 if (PyUnicode_READY(self) < 0)
12549 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
Victor Stinnerc4b49542011-12-11 22:44:26 +010012551 if (PyUnicode_GET_LENGTH(self) >= width)
12552 return unicode_result_unchanged(self);
12553
12554 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555
12556 u = pad(self, fill, 0, '0');
12557
Walter Dörwald068325e2002-04-15 13:36:47 +000012558 if (u == NULL)
12559 return NULL;
12560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 kind = PyUnicode_KIND(u);
12562 data = PyUnicode_DATA(u);
12563 chr = PyUnicode_READ(kind, data, fill);
12564
12565 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 PyUnicode_WRITE(kind, data, 0, chr);
12568 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569 }
12570
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012571 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012572 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii;

    ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012583PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012586Return True if S starts with the specified prefix, False otherwise.\n\
12587With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012588With optional end, stop comparing S at that position.\n\
12589prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590
12591static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012592unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012595 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012596 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012597 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012598 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012599 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
Jesus Ceaac451502011-04-20 17:09:23 +020012601 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012603 if (PyTuple_Check(subobj)) {
12604 Py_ssize_t i;
12605 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012606 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012607 if (substring == NULL)
12608 return NULL;
12609 result = tailmatch(self, substring, start, end, -1);
12610 Py_DECREF(substring);
12611 if (result) {
12612 Py_RETURN_TRUE;
12613 }
12614 }
12615 /* nothing matched */
12616 Py_RETURN_FALSE;
12617 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012618 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012619 if (substring == NULL) {
12620 if (PyErr_ExceptionMatches(PyExc_TypeError))
12621 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12622 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012624 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012625 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012627 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628}
12629
12630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012631PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012634Return True if S ends with the specified suffix, False otherwise.\n\
12635With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012636With optional end, stop comparing S at that position.\n\
12637suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638
12639static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012640unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012643 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012644 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012645 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012646 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012647 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
Jesus Ceaac451502011-04-20 17:09:23 +020012649 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012651 if (PyTuple_Check(subobj)) {
12652 Py_ssize_t i;
12653 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012654 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012656 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012658 result = tailmatch(self, substring, start, end, +1);
12659 Py_DECREF(substring);
12660 if (result) {
12661 Py_RETURN_TRUE;
12662 }
12663 }
12664 Py_RETURN_FALSE;
12665 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012667 if (substring == NULL) {
12668 if (PyErr_ExceptionMatches(PyExc_TypeError))
12669 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12670 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012672 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012673 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012675 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676}
12677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012679
12680PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012682\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012683Return a formatted version of S, using substitutions from args and kwargs.\n\
12684The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012685
Eric Smith27bbca62010-11-04 17:06:58 +000012686PyDoc_STRVAR(format_map__doc__,
12687 "S.format_map(mapping) -> str\n\
12688\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012689Return a formatted version of S, using substitutions from mapping.\n\
12690The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012691
Eric Smith4a7d76d2008-05-30 18:10:19 +000012692static PyObject *
12693unicode__format__(PyObject* self, PyObject* args)
12694{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012695 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012696
12697 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12698 return NULL;
12699
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012700 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012702 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012703}
12704
Eric Smith8c663262007-08-25 02:26:07 +000012705PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012707\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012708Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012709
12710static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012711unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 Py_ssize_t size;
12714
12715 /* If it's a compact object, account for base structure +
12716 character data. */
12717 if (PyUnicode_IS_COMPACT_ASCII(v))
12718 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12719 else if (PyUnicode_IS_COMPACT(v))
12720 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012721 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 else {
12723 /* If it is a two-block object, account for base object, and
12724 for character block if present. */
12725 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012726 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012728 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 }
12730 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012731 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012732 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012734 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012735 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736
12737 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012738}
12739
12740PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012742
12743static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012744unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012745{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012746 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 if (!copy)
12748 return NULL;
12749 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012750}
12751
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752static PyMethodDef unicode_methods[] = {
12753
12754 /* Order is according to common usage: often used methods should
12755 appear first, since lookup is done sequentially. */
12756
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012757 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012758 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12759 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012760 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012761 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12762 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12763 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12764 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12765 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12766 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12767 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012768 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012769 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12770 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12771 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012772 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012773 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12774 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12775 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012776 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012778 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012779 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012780 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12781 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12782 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12783 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12784 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12785 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12786 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12787 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12788 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12789 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12790 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12791 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12792 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12793 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012794 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012795 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012796 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012797 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012798 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012799 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012800 {"maketrans", (PyCFunction) unicode_maketrans,
12801 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012802 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012803#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012804 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805#endif
12806
12807#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012808 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012809 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810#endif
12811
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813 {NULL, NULL}
12814};
12815
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012816static PyObject *
12817unicode_mod(PyObject *v, PyObject *w)
12818{
Brian Curtindfc80e32011-08-10 20:28:54 -050012819 if (!PyUnicode_Check(v))
12820 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012822}
12823
12824static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012825 0, /*nb_add*/
12826 0, /*nb_subtract*/
12827 0, /*nb_multiply*/
12828 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012829};
12830
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012832 (lenfunc) unicode_length, /* sq_length */
12833 PyUnicode_Concat, /* sq_concat */
12834 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12835 (ssizeargfunc) unicode_getitem, /* sq_item */
12836 0, /* sq_slice */
12837 0, /* sq_ass_item */
12838 0, /* sq_ass_slice */
12839 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840};
12841
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012842static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012843unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845 if (PyUnicode_READY(self) == -1)
12846 return NULL;
12847
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012848 if (PyIndex_Check(item)) {
12849 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012850 if (i == -1 && PyErr_Occurred())
12851 return NULL;
12852 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012854 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012855 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012856 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012857 PyObject *result;
12858 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012859 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012860 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012864 return NULL;
12865 }
12866
12867 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012868 Py_INCREF(unicode_empty);
12869 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010012871 slicelength == PyUnicode_GET_LENGTH(self)) {
12872 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000012873 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012874 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012875 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012876 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012877 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012878 src_kind = PyUnicode_KIND(self);
12879 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012880 if (!PyUnicode_IS_ASCII(self)) {
12881 kind_limit = kind_maxchar_limit(src_kind);
12882 max_char = 0;
12883 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12884 ch = PyUnicode_READ(src_kind, src_data, cur);
12885 if (ch > max_char) {
12886 max_char = ch;
12887 if (max_char >= kind_limit)
12888 break;
12889 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012890 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012891 }
Victor Stinner55c99112011-10-13 01:17:06 +020012892 else
12893 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012894 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012895 if (result == NULL)
12896 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012897 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012898 dest_data = PyUnicode_DATA(result);
12899
12900 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012901 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12902 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012903 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012904 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012905 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012906 } else {
12907 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12908 return NULL;
12909 }
12910}
12911
12912static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012913 (lenfunc)unicode_length, /* mp_length */
12914 (binaryfunc)unicode_subscript, /* mp_subscript */
12915 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012916};
12917
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919/* Helpers for PyUnicode_Format() */
12920
12921static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012922getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012924 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 (*p_argidx)++;
12927 if (arglen < 0)
12928 return args;
12929 else
12930 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931 }
12932 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934 return NULL;
12935}
12936
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012937/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012939static PyObject *
12940formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012942 char *p;
12943 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012945
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946 x = PyFloat_AsDouble(v);
12947 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012948 return NULL;
12949
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012951 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012952
Eric Smith0923d1d2009-04-16 20:16:10 +000012953 p = PyOS_double_to_string(x, type, prec,
12954 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012955 if (p == NULL)
12956 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012958 PyMem_Free(p);
12959 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960}
12961
Tim Peters38fd5b62000-09-21 05:43:11 +000012962static PyObject*
12963formatlong(PyObject *val, int flags, int prec, int type)
12964{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012965 char *buf;
12966 int len;
12967 PyObject *str; /* temporary string object. */
12968 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012969
Benjamin Peterson14339b62009-01-31 16:36:08 +000012970 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12971 if (!str)
12972 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012974 Py_DECREF(str);
12975 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012976}
12977
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012978static Py_UCS4
12979formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012981 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012982 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012984 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 goto onError;
12987 }
12988 else {
12989 /* Integer input truncated to a character */
12990 long x;
12991 x = PyLong_AsLong(v);
12992 if (x == -1 && PyErr_Occurred())
12993 goto onError;
12994
Victor Stinner8faf8212011-12-08 22:14:11 +010012995 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 PyErr_SetString(PyExc_OverflowError,
12997 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012998 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 }
13000
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013001 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013002 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013003
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013005 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013007 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008}
13009
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013010static int
13011repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13012{
13013 int r;
13014 assert(count > 0);
13015 assert(PyUnicode_Check(obj));
13016 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013017 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013018 if (repeated == NULL)
13019 return -1;
13020 r = _PyAccu_Accumulate(acc, repeated);
13021 Py_DECREF(repeated);
13022 return r;
13023 }
13024 else {
13025 do {
13026 if (_PyAccu_Accumulate(acc, obj))
13027 return -1;
13028 } while (--count);
13029 return 0;
13030 }
13031}
13032
Alexander Belopolsky40018472011-02-26 01:02:56 +000013033PyObject *
13034PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 void *fmt;
13037 int fmtkind;
13038 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013040 int r;
13041 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013044 PyObject *temp = NULL;
13045 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013046 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013047 _PyAccu acc;
13048 static PyObject *plus, *minus, *blank, *zero, *percent;
13049
13050 if (!plus && !(plus = get_latin1_char('+')))
13051 return NULL;
13052 if (!minus && !(minus = get_latin1_char('-')))
13053 return NULL;
13054 if (!blank && !(blank = get_latin1_char(' ')))
13055 return NULL;
13056 if (!zero && !(zero = get_latin1_char('0')))
13057 return NULL;
13058 if (!percent && !(percent = get_latin1_char('%')))
13059 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013060
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 PyErr_BadInternalCall();
13063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013065 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013068 if (_PyAccu_Init(&acc))
13069 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 fmt = PyUnicode_DATA(uformat);
13071 fmtkind = PyUnicode_KIND(uformat);
13072 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13073 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 arglen = PyTuple_Size(args);
13077 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078 }
13079 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013080 arglen = -1;
13081 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013083 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013084 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
13087 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013089 PyObject *nonfmt;
13090 Py_ssize_t nonfmtpos;
13091 nonfmtpos = fmtpos++;
13092 while (fmtcnt >= 0 &&
13093 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13094 fmtpos++;
13095 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013096 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013097 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013098 if (nonfmt == NULL)
13099 goto onError;
13100 r = _PyAccu_Accumulate(&acc, nonfmt);
13101 Py_DECREF(nonfmt);
13102 if (r)
13103 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013104 }
13105 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 /* Got a format specifier */
13107 int flags = 0;
13108 Py_ssize_t width = -1;
13109 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013111 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 int isnumok;
13113 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013114 void *pbuf = NULL;
13115 Py_ssize_t pindex, len;
13116 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 fmtpos++;
13119 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13120 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 Py_ssize_t keylen;
13122 PyObject *key;
13123 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013124
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 if (dict == NULL) {
13126 PyErr_SetString(PyExc_TypeError,
13127 "format requires a mapping");
13128 goto onError;
13129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 /* Skip over balanced parentheses */
13134 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 if (fmtcnt < 0 || pcount > 0) {
13143 PyErr_SetString(PyExc_ValueError,
13144 "incomplete format key");
13145 goto onError;
13146 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013147 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013148 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 if (key == NULL)
13150 goto onError;
13151 if (args_owned) {
13152 Py_DECREF(args);
13153 args_owned = 0;
13154 }
13155 args = PyObject_GetItem(dict, key);
13156 Py_DECREF(key);
13157 if (args == NULL) {
13158 goto onError;
13159 }
13160 args_owned = 1;
13161 arglen = -1;
13162 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 case '-': flags |= F_LJUST; continue;
13167 case '+': flags |= F_SIGN; continue;
13168 case ' ': flags |= F_BLANK; continue;
13169 case '#': flags |= F_ALT; continue;
13170 case '0': flags |= F_ZERO; continue;
13171 }
13172 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 if (c == '*') {
13175 v = getnextarg(args, arglen, &argidx);
13176 if (v == NULL)
13177 goto onError;
13178 if (!PyLong_Check(v)) {
13179 PyErr_SetString(PyExc_TypeError,
13180 "* wants int");
13181 goto onError;
13182 }
13183 width = PyLong_AsLong(v);
13184 if (width == -1 && PyErr_Occurred())
13185 goto onError;
13186 if (width < 0) {
13187 flags |= F_LJUST;
13188 width = -width;
13189 }
13190 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 }
13193 else if (c >= '0' && c <= '9') {
13194 width = c - '0';
13195 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 if (c < '0' || c > '9')
13198 break;
13199 if ((width*10) / 10 != width) {
13200 PyErr_SetString(PyExc_ValueError,
13201 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 }
13204 width = width*10 + (c - '0');
13205 }
13206 }
13207 if (c == '.') {
13208 prec = 0;
13209 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 if (c == '*') {
13212 v = getnextarg(args, arglen, &argidx);
13213 if (v == NULL)
13214 goto onError;
13215 if (!PyLong_Check(v)) {
13216 PyErr_SetString(PyExc_TypeError,
13217 "* wants int");
13218 goto onError;
13219 }
13220 prec = PyLong_AsLong(v);
13221 if (prec == -1 && PyErr_Occurred())
13222 goto onError;
13223 if (prec < 0)
13224 prec = 0;
13225 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013226 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 }
13228 else if (c >= '0' && c <= '9') {
13229 prec = c - '0';
13230 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 if (c < '0' || c > '9')
13233 break;
13234 if ((prec*10) / 10 != prec) {
13235 PyErr_SetString(PyExc_ValueError,
13236 "prec too big");
13237 goto onError;
13238 }
13239 prec = prec*10 + (c - '0');
13240 }
13241 }
13242 } /* prec */
13243 if (fmtcnt >= 0) {
13244 if (c == 'h' || c == 'l' || c == 'L') {
13245 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 }
13248 }
13249 if (fmtcnt < 0) {
13250 PyErr_SetString(PyExc_ValueError,
13251 "incomplete format");
13252 goto onError;
13253 }
13254 if (c != '%') {
13255 v = getnextarg(args, arglen, &argidx);
13256 if (v == NULL)
13257 goto onError;
13258 }
13259 sign = 0;
13260 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013261 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013262 switch (c) {
13263
13264 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013265 _PyAccu_Accumulate(&acc, percent);
13266 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013267
13268 case 's':
13269 case 'r':
13270 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013271 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 temp = v;
13273 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013274 }
13275 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 if (c == 's')
13277 temp = PyObject_Str(v);
13278 else if (c == 'r')
13279 temp = PyObject_Repr(v);
13280 else
13281 temp = PyObject_ASCII(v);
13282 if (temp == NULL)
13283 goto onError;
13284 if (PyUnicode_Check(temp))
13285 /* nothing to do */;
13286 else {
13287 Py_DECREF(temp);
13288 PyErr_SetString(PyExc_TypeError,
13289 "%s argument has non-string str()");
13290 goto onError;
13291 }
13292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 if (PyUnicode_READY(temp) == -1) {
13294 Py_CLEAR(temp);
13295 goto onError;
13296 }
13297 pbuf = PyUnicode_DATA(temp);
13298 kind = PyUnicode_KIND(temp);
13299 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 if (prec >= 0 && len > prec)
13301 len = prec;
13302 break;
13303
13304 case 'i':
13305 case 'd':
13306 case 'u':
13307 case 'o':
13308 case 'x':
13309 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 isnumok = 0;
13311 if (PyNumber_Check(v)) {
13312 PyObject *iobj=NULL;
13313
13314 if (PyLong_Check(v)) {
13315 iobj = v;
13316 Py_INCREF(iobj);
13317 }
13318 else {
13319 iobj = PyNumber_Long(v);
13320 }
13321 if (iobj!=NULL) {
13322 if (PyLong_Check(iobj)) {
13323 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013324 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 Py_DECREF(iobj);
13326 if (!temp)
13327 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 if (PyUnicode_READY(temp) == -1) {
13329 Py_CLEAR(temp);
13330 goto onError;
13331 }
13332 pbuf = PyUnicode_DATA(temp);
13333 kind = PyUnicode_KIND(temp);
13334 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 sign = 1;
13336 }
13337 else {
13338 Py_DECREF(iobj);
13339 }
13340 }
13341 }
13342 if (!isnumok) {
13343 PyErr_Format(PyExc_TypeError,
13344 "%%%c format: a number is required, "
13345 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13346 goto onError;
13347 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013348 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013350 fillobj = zero;
13351 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 break;
13353
13354 case 'e':
13355 case 'E':
13356 case 'f':
13357 case 'F':
13358 case 'g':
13359 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013360 temp = formatfloat(v, flags, prec, c);
13361 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013363 if (PyUnicode_READY(temp) == -1) {
13364 Py_CLEAR(temp);
13365 goto onError;
13366 }
13367 pbuf = PyUnicode_DATA(temp);
13368 kind = PyUnicode_KIND(temp);
13369 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013371 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013373 fillobj = zero;
13374 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 break;
13376
13377 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013378 {
13379 Py_UCS4 ch = formatchar(v);
13380 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013382 temp = _PyUnicode_FromUCS4(&ch, 1);
13383 if (temp == NULL)
13384 goto onError;
13385 pbuf = PyUnicode_DATA(temp);
13386 kind = PyUnicode_KIND(temp);
13387 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013389 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013390
13391 default:
13392 PyErr_Format(PyExc_ValueError,
13393 "unsupported format character '%c' (0x%x) "
13394 "at index %zd",
13395 (31<=c && c<=126) ? (char)c : '?',
13396 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013397 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 goto onError;
13399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 /* pbuf is initialized here. */
13401 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013403 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13404 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013406 pindex++;
13407 }
13408 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13409 signobj = plus;
13410 len--;
13411 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 }
13413 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 else
13418 sign = 0;
13419 }
13420 if (width < len)
13421 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013423 if (fill != ' ') {
13424 assert(signobj != NULL);
13425 if (_PyAccu_Accumulate(&acc, signobj))
13426 goto onError;
13427 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 if (width > len)
13429 width--;
13430 }
13431 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013432 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013433 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013435 second = get_latin1_char(
13436 PyUnicode_READ(kind, pbuf, pindex + 1));
13437 pindex += 2;
13438 if (second == NULL ||
13439 _PyAccu_Accumulate(&acc, zero) ||
13440 _PyAccu_Accumulate(&acc, second))
13441 goto onError;
13442 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 width -= 2;
13445 if (width < 0)
13446 width = 0;
13447 len -= 2;
13448 }
13449 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013450 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013451 if (repeat_accumulate(&acc, fillobj, width - len))
13452 goto onError;
13453 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 }
13455 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013456 if (sign) {
13457 assert(signobj != NULL);
13458 if (_PyAccu_Accumulate(&acc, signobj))
13459 goto onError;
13460 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013462 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13463 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013464 second = get_latin1_char(
13465 PyUnicode_READ(kind, pbuf, pindex + 1));
13466 pindex += 2;
13467 if (second == NULL ||
13468 _PyAccu_Accumulate(&acc, zero) ||
13469 _PyAccu_Accumulate(&acc, second))
13470 goto onError;
13471 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 }
13473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 if (temp != NULL) {
13476 assert(pbuf == PyUnicode_DATA(temp));
13477 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013478 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013479 else {
13480 const char *p = (const char *) pbuf;
13481 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013482 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013483 v = PyUnicode_FromKindAndData(kind, p, len);
13484 }
13485 if (v == NULL)
13486 goto onError;
13487 r = _PyAccu_Accumulate(&acc, v);
13488 Py_DECREF(v);
13489 if (r)
13490 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013491 if (width > len && repeat_accumulate(&acc, blank, width - len))
13492 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 if (dict && (argidx < arglen) && c != '%') {
13494 PyErr_SetString(PyExc_TypeError,
13495 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 goto onError;
13497 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013498 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013500 } /* until end */
13501 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 PyErr_SetString(PyExc_TypeError,
13503 "not all arguments converted during string formatting");
13504 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013505 }
13506
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013507 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013510 }
13511 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013512 Py_XDECREF(temp);
13513 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013514 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013517 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013518 Py_XDECREF(temp);
13519 Py_XDECREF(second);
13520 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013521 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523 }
13524 return NULL;
13525}
13526
Jeremy Hylton938ace62002-07-17 16:30:39 +000013527static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013528unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13529
Tim Peters6d6c1a32001-08-02 04:15:00 +000013530static PyObject *
13531unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13532{
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 static char *kwlist[] = {"object", "encoding", "errors", 0};
13535 char *encoding = NULL;
13536 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013537
Benjamin Peterson14339b62009-01-31 16:36:08 +000013538 if (type != &PyUnicode_Type)
13539 return unicode_subtype_new(type, args, kwds);
13540 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013543 if (x == NULL) {
13544 Py_INCREF(unicode_empty);
13545 return unicode_empty;
13546 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013547 if (encoding == NULL && errors == NULL)
13548 return PyObject_Str(x);
13549 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013551}
13552
Guido van Rossume023fe02001-08-30 03:12:59 +000013553static PyObject *
13554unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13555{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013556 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013557 Py_ssize_t length, char_size;
13558 int share_wstr, share_utf8;
13559 unsigned int kind;
13560 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013561
Benjamin Peterson14339b62009-01-31 16:36:08 +000013562 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013563
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013564 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013565 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013566 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013567 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013568 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013569 return NULL;
13570
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013571 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013572 if (self == NULL) {
13573 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013574 return NULL;
13575 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013576 kind = PyUnicode_KIND(unicode);
13577 length = PyUnicode_GET_LENGTH(unicode);
13578
13579 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013580#ifdef Py_DEBUG
13581 _PyUnicode_HASH(self) = -1;
13582#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013583 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013584#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013585 _PyUnicode_STATE(self).interned = 0;
13586 _PyUnicode_STATE(self).kind = kind;
13587 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013588 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013589 _PyUnicode_STATE(self).ready = 1;
13590 _PyUnicode_WSTR(self) = NULL;
13591 _PyUnicode_UTF8_LENGTH(self) = 0;
13592 _PyUnicode_UTF8(self) = NULL;
13593 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013594 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013595
13596 share_utf8 = 0;
13597 share_wstr = 0;
13598 if (kind == PyUnicode_1BYTE_KIND) {
13599 char_size = 1;
13600 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13601 share_utf8 = 1;
13602 }
13603 else if (kind == PyUnicode_2BYTE_KIND) {
13604 char_size = 2;
13605 if (sizeof(wchar_t) == 2)
13606 share_wstr = 1;
13607 }
13608 else {
13609 assert(kind == PyUnicode_4BYTE_KIND);
13610 char_size = 4;
13611 if (sizeof(wchar_t) == 4)
13612 share_wstr = 1;
13613 }
13614
13615 /* Ensure we won't overflow the length. */
13616 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13617 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013618 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013619 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013620 data = PyObject_MALLOC((length + 1) * char_size);
13621 if (data == NULL) {
13622 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013623 goto onError;
13624 }
13625
Victor Stinnerc3c74152011-10-02 20:39:55 +020013626 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013627 if (share_utf8) {
13628 _PyUnicode_UTF8_LENGTH(self) = length;
13629 _PyUnicode_UTF8(self) = data;
13630 }
13631 if (share_wstr) {
13632 _PyUnicode_WSTR_LENGTH(self) = length;
13633 _PyUnicode_WSTR(self) = (wchar_t *)data;
13634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013635
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013636 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013637 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013638 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013639#ifdef Py_DEBUG
13640 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13641#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013642 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013643 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013644
13645onError:
13646 Py_DECREF(unicode);
13647 Py_DECREF(self);
13648 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013649}
13650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013651PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013653\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013654Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013655encoding defaults to the current default string encoding.\n\
13656errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013657
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013658static PyObject *unicode_iter(PyObject *seq);
13659
Guido van Rossumd57fd912000-03-10 22:53:23 +000013660PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013661 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013662 "str", /* tp_name */
13663 sizeof(PyUnicodeObject), /* tp_size */
13664 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013666 (destructor)unicode_dealloc, /* tp_dealloc */
13667 0, /* tp_print */
13668 0, /* tp_getattr */
13669 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013670 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013671 unicode_repr, /* tp_repr */
13672 &unicode_as_number, /* tp_as_number */
13673 &unicode_as_sequence, /* tp_as_sequence */
13674 &unicode_as_mapping, /* tp_as_mapping */
13675 (hashfunc) unicode_hash, /* tp_hash*/
13676 0, /* tp_call*/
13677 (reprfunc) unicode_str, /* tp_str */
13678 PyObject_GenericGetAttr, /* tp_getattro */
13679 0, /* tp_setattro */
13680 0, /* tp_as_buffer */
13681 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013683 unicode_doc, /* tp_doc */
13684 0, /* tp_traverse */
13685 0, /* tp_clear */
13686 PyUnicode_RichCompare, /* tp_richcompare */
13687 0, /* tp_weaklistoffset */
13688 unicode_iter, /* tp_iter */
13689 0, /* tp_iternext */
13690 unicode_methods, /* tp_methods */
13691 0, /* tp_members */
13692 0, /* tp_getset */
13693 &PyBaseObject_Type, /* tp_base */
13694 0, /* tp_dict */
13695 0, /* tp_descr_get */
13696 0, /* tp_descr_set */
13697 0, /* tp_dictoffset */
13698 0, /* tp_init */
13699 0, /* tp_alloc */
13700 unicode_new, /* tp_new */
13701 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702};
13703
13704/* Initialize the Unicode implementation */
13705
Victor Stinner3a50e702011-10-18 21:21:00 +020013706int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013708 int i;
13709
Thomas Wouters477c8d52006-05-27 19:21:47 +000013710 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013711 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013712 0x000A, /* LINE FEED */
13713 0x000D, /* CARRIAGE RETURN */
13714 0x001C, /* FILE SEPARATOR */
13715 0x001D, /* GROUP SEPARATOR */
13716 0x001E, /* RECORD SEPARATOR */
13717 0x0085, /* NEXT LINE */
13718 0x2028, /* LINE SEPARATOR */
13719 0x2029, /* PARAGRAPH SEPARATOR */
13720 };
13721
Fred Drakee4315f52000-05-09 19:53:39 +000013722 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013723 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013724 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013725 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013726 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013727
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013728 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013730 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013732
13733 /* initialize the linebreak bloom filter */
13734 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013736 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013737
13738 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013739
13740#ifdef HAVE_MBCS
13741 winver.dwOSVersionInfoSize = sizeof(winver);
13742 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13743 PyErr_SetFromWindowsErr(0);
13744 return -1;
13745 }
13746#endif
13747 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013748}
13749
13750/* Finalize the Unicode implementation */
13751
/* Free-list hook for str objects.  This implementation does no work and
   unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13757
Guido van Rossumd57fd912000-03-10 22:53:23 +000013758void
Thomas Wouters78890102000-07-22 19:25:51 +000013759_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013760{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013761 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013762
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013763 Py_XDECREF(unicode_empty);
13764 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013765
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013766 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013767 if (unicode_latin1[i]) {
13768 Py_DECREF(unicode_latin1[i]);
13769 unicode_latin1[i] = NULL;
13770 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013771 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013772 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013773 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013775
Walter Dörwald16807132007-05-25 13:52:07 +000013776void
13777PyUnicode_InternInPlace(PyObject **p)
13778{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013779 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013780 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013781#ifdef Py_DEBUG
13782 assert(s != NULL);
13783 assert(_PyUnicode_CHECK(s));
13784#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013785 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013786 return;
13787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013788 /* If it's a subclass, we don't really know what putting
13789 it in the interned dict might do. */
13790 if (!PyUnicode_CheckExact(s))
13791 return;
13792 if (PyUnicode_CHECK_INTERNED(s))
13793 return;
13794 if (interned == NULL) {
13795 interned = PyDict_New();
13796 if (interned == NULL) {
13797 PyErr_Clear(); /* Don't leave an exception */
13798 return;
13799 }
13800 }
13801 /* It might be that the GetItem call fails even
13802 though the key is present in the dictionary,
13803 namely when this happens during a stack overflow. */
13804 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013805 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013806 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013807
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 if (t) {
13809 Py_INCREF(t);
13810 Py_DECREF(*p);
13811 *p = t;
13812 return;
13813 }
Walter Dörwald16807132007-05-25 13:52:07 +000013814
Benjamin Peterson14339b62009-01-31 16:36:08 +000013815 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013816 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013817 PyErr_Clear();
13818 PyThreadState_GET()->recursion_critical = 0;
13819 return;
13820 }
13821 PyThreadState_GET()->recursion_critical = 0;
13822 /* The two references in interned are not counted by refcnt.
13823 The deallocator will take care of this */
13824 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013826}
13827
13828void
13829PyUnicode_InternImmortal(PyObject **p)
13830{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013831 PyUnicode_InternInPlace(p);
13832 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013833 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013834 Py_INCREF(*p);
13835 }
Walter Dörwald16807132007-05-25 13:52:07 +000013836}
13837
13838PyObject *
13839PyUnicode_InternFromString(const char *cp)
13840{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 PyObject *s = PyUnicode_FromString(cp);
13842 if (s == NULL)
13843 return NULL;
13844 PyUnicode_InternInPlace(&s);
13845 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013846}
13847
Alexander Belopolsky40018472011-02-26 01:02:56 +000013848void
13849_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013850{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013851 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013852 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013853 Py_ssize_t i, n;
13854 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013855
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 if (interned == NULL || !PyDict_Check(interned))
13857 return;
13858 keys = PyDict_Keys(interned);
13859 if (keys == NULL || !PyList_Check(keys)) {
13860 PyErr_Clear();
13861 return;
13862 }
Walter Dörwald16807132007-05-25 13:52:07 +000013863
Benjamin Peterson14339b62009-01-31 16:36:08 +000013864 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13865 detector, interned unicode strings are not forcibly deallocated;
13866 rather, we give them their stolen references back, and then clear
13867 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013868
Benjamin Peterson14339b62009-01-31 16:36:08 +000013869 n = PyList_GET_SIZE(keys);
13870 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013872 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013873 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013874 if (PyUnicode_READY(s) == -1) {
13875 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013876 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013879 case SSTATE_NOT_INTERNED:
13880 /* XXX Shouldn't happen */
13881 break;
13882 case SSTATE_INTERNED_IMMORTAL:
13883 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013885 break;
13886 case SSTATE_INTERNED_MORTAL:
13887 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013889 break;
13890 default:
13891 Py_FatalError("Inconsistent interned string state.");
13892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013893 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013894 }
13895 fprintf(stderr, "total size of all interned strings: "
13896 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13897 "mortal/immortal\n", mortal_size, immortal_size);
13898 Py_DECREF(keys);
13899 PyDict_Clear(interned);
13900 Py_DECREF(interned);
13901 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013902}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013903
13904
13905/********************* Unicode Iterator **************************/
13906
13907typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013908 PyObject_HEAD
13909 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013910 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013911} unicodeiterobject;
13912
13913static void
13914unicodeiter_dealloc(unicodeiterobject *it)
13915{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013916 _PyObject_GC_UNTRACK(it);
13917 Py_XDECREF(it->it_seq);
13918 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013919}
13920
13921static int
13922unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13923{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013924 Py_VISIT(it->it_seq);
13925 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013926}
13927
13928static PyObject *
13929unicodeiter_next(unicodeiterobject *it)
13930{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013931 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013932
Benjamin Peterson14339b62009-01-31 16:36:08 +000013933 assert(it != NULL);
13934 seq = it->it_seq;
13935 if (seq == NULL)
13936 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013937 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013939 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13940 int kind = PyUnicode_KIND(seq);
13941 void *data = PyUnicode_DATA(seq);
13942 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13943 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013944 if (item != NULL)
13945 ++it->it_index;
13946 return item;
13947 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013948
Benjamin Peterson14339b62009-01-31 16:36:08 +000013949 Py_DECREF(seq);
13950 it->it_seq = NULL;
13951 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013952}
13953
13954static PyObject *
13955unicodeiter_len(unicodeiterobject *it)
13956{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 Py_ssize_t len = 0;
13958 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013959 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013960 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013961}
13962
13963PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13964
13965static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013969};
13970
13971PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13973 "str_iterator", /* tp_name */
13974 sizeof(unicodeiterobject), /* tp_basicsize */
13975 0, /* tp_itemsize */
13976 /* methods */
13977 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13978 0, /* tp_print */
13979 0, /* tp_getattr */
13980 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013981 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 0, /* tp_repr */
13983 0, /* tp_as_number */
13984 0, /* tp_as_sequence */
13985 0, /* tp_as_mapping */
13986 0, /* tp_hash */
13987 0, /* tp_call */
13988 0, /* tp_str */
13989 PyObject_GenericGetAttr, /* tp_getattro */
13990 0, /* tp_setattro */
13991 0, /* tp_as_buffer */
13992 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13993 0, /* tp_doc */
13994 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13995 0, /* tp_clear */
13996 0, /* tp_richcompare */
13997 0, /* tp_weaklistoffset */
13998 PyObject_SelfIter, /* tp_iter */
13999 (iternextfunc)unicodeiter_next, /* tp_iternext */
14000 unicodeiter_methods, /* tp_methods */
14001 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014002};
14003
14004static PyObject *
14005unicode_iter(PyObject *seq)
14006{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014008
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 if (!PyUnicode_Check(seq)) {
14010 PyErr_BadInternalCall();
14011 return NULL;
14012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014013 if (PyUnicode_READY(seq) == -1)
14014 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14016 if (it == NULL)
14017 return NULL;
14018 it->it_index = 0;
14019 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014020 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 _PyObject_GC_TRACK(it);
14022 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014023}
14024
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014025
14026size_t
14027Py_UNICODE_strlen(const Py_UNICODE *u)
14028{
14029 int res = 0;
14030 while(*u++)
14031 res++;
14032 return res;
14033}
14034
14035Py_UNICODE*
14036Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14037{
14038 Py_UNICODE *u = s1;
14039 while ((*u++ = *s2++));
14040 return s1;
14041}
14042
14043Py_UNICODE*
14044Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14045{
14046 Py_UNICODE *u = s1;
14047 while ((*u++ = *s2++))
14048 if (n-- == 0)
14049 break;
14050 return s1;
14051}
14052
14053Py_UNICODE*
14054Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14055{
14056 Py_UNICODE *u1 = s1;
14057 u1 += Py_UNICODE_strlen(u1);
14058 Py_UNICODE_strcpy(u1, s2);
14059 return s1;
14060}
14061
14062int
14063Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14064{
14065 while (*s1 && *s2 && *s1 == *s2)
14066 s1++, s2++;
14067 if (*s1 && *s2)
14068 return (*s1 < *s2) ? -1 : +1;
14069 if (*s1)
14070 return 1;
14071 if (*s2)
14072 return -1;
14073 return 0;
14074}
14075
14076int
14077Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14078{
14079 register Py_UNICODE u1, u2;
14080 for (; n != 0; n--) {
14081 u1 = *s1;
14082 u2 = *s2;
14083 if (u1 != u2)
14084 return (u1 < u2) ? -1 : +1;
14085 if (u1 == '\0')
14086 return 0;
14087 s1++;
14088 s2++;
14089 }
14090 return 0;
14091}
14092
14093Py_UNICODE*
14094Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14095{
14096 const Py_UNICODE *p;
14097 for (p = s; *p; p++)
14098 if (*p == c)
14099 return (Py_UNICODE*)p;
14100 return NULL;
14101}
14102
14103Py_UNICODE*
14104Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14105{
14106 const Py_UNICODE *p;
14107 p = s + Py_UNICODE_strlen(s);
14108 while (p != s) {
14109 p--;
14110 if (*p == c)
14111 return (Py_UNICODE*)p;
14112 }
14113 return NULL;
14114}
Victor Stinner331ea922010-08-10 16:37:20 +000014115
Victor Stinner71133ff2010-09-01 23:43:53 +000014116Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014117PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014118{
Victor Stinner577db2c2011-10-11 22:12:48 +020014119 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014120 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014122 if (!PyUnicode_Check(unicode)) {
14123 PyErr_BadArgument();
14124 return NULL;
14125 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014126 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014127 if (u == NULL)
14128 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014129 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014130 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014131 PyErr_NoMemory();
14132 return NULL;
14133 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014134 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014135 size *= sizeof(Py_UNICODE);
14136 copy = PyMem_Malloc(size);
14137 if (copy == NULL) {
14138 PyErr_NoMemory();
14139 return NULL;
14140 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014141 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014142 return copy;
14143}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014144
Georg Brandl66c221e2010-10-14 07:04:07 +000014145/* A _string module, to export formatter_parser and formatter_field_name_split
14146 to the string.Formatter class implemented in Python. */
14147
14148static PyMethodDef _string_methods[] = {
14149 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14150 METH_O, PyDoc_STR("split the argument as a field name")},
14151 {"formatter_parser", (PyCFunction) formatter_parser,
14152 METH_O, PyDoc_STR("parse the argument as a format string")},
14153 {NULL, NULL}
14154};
14155
14156static struct PyModuleDef _string_module = {
14157 PyModuleDef_HEAD_INIT,
14158 "_string",
14159 PyDoc_STR("string helper module"),
14160 0,
14161 _string_methods,
14162 NULL,
14163 NULL,
14164 NULL,
14165 NULL
14166};
14167
14168PyMODINIT_FUNC
14169PyInit__string(void)
14170{
14171 return PyModule_Create(&_string_module);
14172}
14173
14174
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014175#ifdef __cplusplus
14176}
14177#endif