blob: 6edba5d597c1c904810ad18c559a2dbd37335df6 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
51#ifdef WORDS_BIGENDIAN
52# define BYTEORDER_IS_BIG_ENDIAN
53#else
54# define BYTEORDER_IS_LITTLE_ENDIAN
55#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner8faf8212011-12-08 22:14:11 +010069/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70#define MAX_UNICODE 0x10ffff
71
/* Object check used by the internal accessor macros below: debug builds
   run the full consistency checker (without scanning the content), other
   builds fall back to PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Cached UTF-8 buffer.  The underscore variant reads the struct field
   directly; the checked variant returns the inline character data for
   compact ASCII strings (stored right after the PyASCIIObject header). */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((char*)((PyASCIIObject*)(op) + 1)) :                  \
         _PyUnicode_UTF8(op))

/* Length of the cached UTF-8 buffer.  For compact ASCII strings the checked
   variant returns the string length instead of reading utf8_length. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((PyASCIIObject*)(op))->length :                       \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)

/* Checked accessors for the storage kind and the length. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a non-compact (PyUnicodeObject) string. */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ?                                  \
      0 :                                                       \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the UTF-8 pointer of the (non compact-ASCII) string is the very
   same pointer as its character data, i.e. no separate UTF-8 block exists. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string is the very same pointer as its
   character data.  The macro only refers to its own argument, so it can be
   applied to any expression, whatever the caller's variables are called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the object owns a UTF-8 buffer of its own: the string is
   not compact ASCII, the utf8 pointer is set, and it does not point at the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                    \
      _PyUnicode_UTF8(op) &&                                \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a wstr buffer of its own: the wstr pointer
   is set and either the string is not ready yet or wstr does not point at
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) &&                                \
      (!PyUnicode_IS_READY(op) ||                           \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the half-open range [begin, end) is converted.  to points to the buffer
   the result characters are written to; it is converted to "to_type *"
   as a whole expression, so an argument such as "buf + offset" keeps the
   pointer arithmetic of its own type.

   The copy is done four characters at a time, followed by a plain loop for
   the remaining (at most three) characters.  Each character is converted
   with a simple cast. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: entry i is 1
   when code point i (0x00 - 0x7F) is one of the whitespace characters
   listed below, and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION, 0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Same for linebreaks: entry i is 1 when code point i (0x00 - 0x7F) is one
   of the line break characters listed below, and 0 otherwise.  The table is
   only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED, 0x000B LINE TABULATION,
       0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   Every invariant is verified with assert(), so a violation is not reported
   through the return value; when all checks pass the function returns 1.
   If check_content is non-zero and the string is not of the (not ready)
   wchar_t kind, the characters are scanned as well to verify that the
   narrowest possible storage kind is used. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* storage kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* string that only has a wstr buffer and is not ready */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: the characters directly follow
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                /* wstr has to share the character data */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100500 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100501}
502
Victor Stinner3a50e702011-10-18 21:21:00 +0200503#ifdef HAVE_MBCS
504static OSVERSIONINFOEX winver;
505#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
509/* stuff to implement simple "bloom filters" for Unicode characters.
510 to keep things simple, we use a single bitmask, using the least 5
511 bits from each unicode characters as the bit index. */
512
513/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525#define BLOOM_MASK unsigned long
526
527static BLOOM_MASK bloom_linebreak;
528
Antoine Pitrouf068f942010-01-13 14:19:12 +0000529#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
530#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532#define BLOOM_LINEBREAK(ch) \
533 ((ch) < 128U ? ascii_linebreak[(ch)] : \
534 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551#define BLOOM_MEMBER(mask, chr, str) \
552 (BLOOM(mask, chr) \
553 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
658 PyErr_NoMemory();
659 return NULL;
660 }
661 new_size = (struct_size + (length + 1) * char_size);
662
Victor Stinner84def372011-12-11 20:04:56 +0100663 _Py_DEC_REFTOTAL;
664 _Py_ForgetReference(unicode);
665
666 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
667 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100668 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyErr_NoMemory();
670 return NULL;
671 }
Victor Stinner84def372011-12-11 20:04:56 +0100672 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100674
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200676 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100678 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 _PyUnicode_WSTR_LENGTH(unicode) = length;
680 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
682 length, 0);
683 return unicode;
684}
685
Alexander Belopolsky40018472011-02-26 01:02:56 +0000686static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688{
Victor Stinner95663112011-10-04 01:03:50 +0200689 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100690 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 if (PyUnicode_IS_READY(unicode)) {
695 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200696 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 void *data;
698
699 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200700 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200701 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
702 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703
704 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
705 PyErr_NoMemory();
706 return -1;
707 }
708 new_size = (length + 1) * char_size;
709
Victor Stinner7a9105a2011-12-12 00:13:42 +0100710 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
711 {
712 PyObject_DEL(_PyUnicode_UTF8(unicode));
713 _PyUnicode_UTF8(unicode) = NULL;
714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
715 }
716
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 data = (PyObject *)PyObject_REALLOC(data, new_size);
718 if (data == NULL) {
719 PyErr_NoMemory();
720 return -1;
721 }
722 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200723 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 _PyUnicode_WSTR_LENGTH(unicode) = length;
726 }
727 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 _PyUnicode_UTF8_LENGTH(unicode) = length;
730 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 _PyUnicode_LENGTH(unicode) = length;
732 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200733 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200734 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinner95663112011-10-04 01:03:50 +0200738 assert(_PyUnicode_WSTR(unicode) != NULL);
739
740 /* check for integer overflow */
741 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
742 PyErr_NoMemory();
743 return -1;
744 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200746 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100747 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200748 if (!wstr) {
749 PyErr_NoMemory();
750 return -1;
751 }
752 _PyUnicode_WSTR(unicode) = wstr;
753 _PyUnicode_WSTR(unicode)[length] = 0;
754 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200755 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 return 0;
757}
758
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759static PyObject*
760resize_copy(PyObject *unicode, Py_ssize_t length)
761{
762 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100763 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100765
766 if (PyUnicode_READY(unicode) < 0)
767 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
770 if (copy == NULL)
771 return NULL;
772
773 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200774 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200776 }
777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200778 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 if (w == NULL)
782 return NULL;
783 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
784 copy_length = Py_MIN(copy_length, length);
785 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
786 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200787 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 }
789}
790
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200801static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802#endif
803
Alexander Belopolsky40018472011-02-26 01:02:56 +0000804static PyUnicodeObject *
805_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806{
807 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Thomas Wouters477c8d52006-05-27 19:21:47 +0000810 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (length == 0 && unicode_empty != NULL) {
812 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200813 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000816 /* Ensure we won't overflow the size. */
817 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
818 return (PyUnicodeObject *)PyErr_NoMemory();
819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 if (length < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to _PyUnicode_New");
823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826#ifdef Py_DEBUG
827 ++unicode_old_new_calls;
828#endif
829
830 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
831 if (unicode == NULL)
832 return NULL;
833 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
834 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
835 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100838 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
863}
864
Victor Stinnerf42dc442011-10-02 23:33:16 +0200865static const char*
866unicode_kind_name(PyObject *unicode)
867{
Victor Stinner42dfd712011-10-03 14:41:45 +0200868 /* don't check consistency: unicode_kind_name() is called from
869 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870 if (!PyUnicode_IS_COMPACT(unicode))
871 {
872 if (!PyUnicode_IS_READY(unicode))
873 return "wstr";
874 switch(PyUnicode_KIND(unicode))
875 {
876 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200878 return "legacy ascii";
879 else
880 return "legacy latin1";
881 case PyUnicode_2BYTE_KIND:
882 return "legacy UCS2";
883 case PyUnicode_4BYTE_KIND:
884 return "legacy UCS4";
885 default:
886 return "<legacy invalid kind>";
887 }
888 }
889 assert(PyUnicode_IS_READY(unicode));
890 switch(PyUnicode_KIND(unicode))
891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singleton
   objects (the empty string, or the cached one-character string stored in
   unicode_latin1[]), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    header = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not in the legacy wchar_t
       representation can be a cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
1794 switch(kind) {
1795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 switch(kind) {
1896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
1962 if (PyUnicode_READY(unicode))
1963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
1988 if (PyUnicode_READY(s))
1989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
1997 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size` units.

   A size of -1 means the buffer is NUL-terminated and its length is taken
   with wcslen().  A NULL buffer is accepted only together with size 0, in
   which case the shared empty string is returned; any other size with a
   NULL buffer raises PyErr_BadInternalCall().  Returns a new reference, or
   NULL on error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
Victor Stinner96865452011-03-01 23:44:09 +00002164/* helper for PyUnicode_FromFormatV() */
2165
2166static const char*
2167parse_format_flags(const char *f,
2168 int *p_width, int *p_precision,
2169 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2170{
2171 int width, precision, longflag, longlongflag, size_tflag;
2172
2173 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2174 f++;
2175 width = 0;
2176 while (Py_ISDIGIT((unsigned)*f))
2177 width = (width*10) + *f++ - '0';
2178 precision = 0;
2179 if (*f == '.') {
2180 f++;
2181 while (Py_ISDIGIT((unsigned)*f))
2182 precision = (precision*10) + *f++ - '0';
2183 if (*f == '%') {
2184 /* "%.3%s" => f points to "3" */
2185 f--;
2186 }
2187 }
2188 if (*f == '\0') {
2189 /* bogus format "%.1" => go backward, f points to "1" */
2190 f--;
2191 }
2192 if (p_width != NULL)
2193 *p_width = width;
2194 if (p_precision != NULL)
2195 *p_precision = precision;
2196
2197 /* Handle %ld, %lu, %lld and %llu. */
2198 longflag = 0;
2199 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002200 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002201
2202 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002203 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 longflag = 1;
2205 ++f;
2206 }
2207#ifdef HAVE_LONG_LONG
2208 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002209 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002210 longlongflag = 1;
2211 f += 2;
2212 }
2213#endif
2214 }
2215 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002216 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002217 size_tflag = 1;
2218 ++f;
2219 }
2220 if (p_longflag != NULL)
2221 *p_longflag = longflag;
2222 if (p_longlongflag != NULL)
2223 *p_longlongflag = longlongflag;
2224 if (p_size_tflag != NULL)
2225 *p_size_tflag = size_tflag;
2226 return f;
2227}
2228
/* Upper bound on the characters produced by %ld: 21 characters are
   enough for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, and
   53/22 is an upper bound for log10(256).  The constant 2 accounts for
   rounding the integer division up and for one sign character. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002474 if (PyUnicode_READY(str_obj)) {
2475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the str and switch to the next slot */
2497 *callresult++ = str;
2498 break;
2499 }
2500 case 'R':
2501 {
2502 PyObject *obj = va_arg(count, PyObject *);
2503 PyObject *repr;
2504 assert(obj);
2505 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002509 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* Remember the repr and switch to the next slot */
2512 *callresult++ = repr;
2513 break;
2514 }
2515 case 'A':
2516 {
2517 PyObject *obj = va_arg(count, PyObject *);
2518 PyObject *ascii;
2519 assert(obj);
2520 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002524 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 /* Remember the repr and switch to the next slot */
2527 *callresult++ = ascii;
2528 break;
2529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 default:
2531 /* if we stumble upon an unknown
2532 formatting code, copy the rest of
2533 the format string to the output
2534 string. (we cannot just skip the
2535 code, since there's no way to know
2536 what's in the argument list) */
2537 n += strlen(p);
2538 goto expand;
2539 }
2540 } else
2541 n++;
2542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 we don't have to resize the string.
2547 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002548 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 if (!string)
2550 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 kind = PyUnicode_KIND(string);
2552 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002558 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002559
2560 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2562 /* checking for == because the last argument could be a empty
2563 string, which causes i to point to end, the assert at the end of
2564 the loop */
2565 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 switch (*f) {
2568 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002569 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 const int ordinal = va_arg(vargs, int);
2571 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002573 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002574 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 case 'p':
2579 /* unused, since we already have the result */
2580 if (*f == 'p')
2581 (void) va_arg(vargs, void *);
2582 else
2583 (void) va_arg(vargs, int);
2584 /* extract the result from numberresults and append. */
2585 for (; *numberresult; ++i, ++numberresult)
2586 PyUnicode_WRITE(kind, data, i, *numberresult);
2587 /* skip over the separating '\0' */
2588 assert(*numberresult == '\0');
2589 numberresult++;
2590 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 case 's':
2593 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002594 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 size = PyUnicode_GET_LENGTH(*callresult);
2598 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002599 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 /* We're done with the unicode()/repr() => forget it */
2602 Py_DECREF(*callresult);
2603 /* switch to next unicode()/repr() result */
2604 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'U':
2608 {
2609 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 Py_ssize_t size;
2611 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2612 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 break;
2616 }
2617 case 'V':
2618 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 size = PyUnicode_GET_LENGTH(obj);
2624 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(*callresult);
2629 assert(PyUnicode_KIND(*callresult) <=
2630 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002631 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 break;
2637 }
2638 case 'S':
2639 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002640 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 /* unused, since we already have the result */
2644 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002646 copy_characters(string, i, *callresult, 0, size);
2647 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* We're done with the unicode()/repr() => forget it */
2649 Py_DECREF(*callresult);
2650 /* switch to next unicode()/repr() result */
2651 ++callresult;
2652 break;
2653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 break;
2657 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 for (; *p; ++p, ++i)
2659 PyUnicode_WRITE(kind, data, i, *p);
2660 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 goto end;
2662 }
Victor Stinner1205f272010-09-11 00:54:47 +00002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 else {
2665 assert(i < PyUnicode_GET_LENGTH(string));
2666 PyUnicode_WRITE(kind, data, i++, *f);
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 if (callresults)
2673 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 if (numberresults)
2675 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002676 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 if (callresults) {
2679 PyObject **callresult2 = callresults;
2680 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002681 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 ++callresult2;
2683 }
2684 PyObject_Free(callresults);
2685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689}
2690
Walter Dörwaldd2034312007-05-18 16:29:38 +00002691PyObject *
2692PyUnicode_FromFormat(const char *format, ...)
2693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 PyObject* ret;
2695 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696
2697#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 ret = PyUnicode_FromFormatV(format, vargs);
2703 va_end(vargs);
2704 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707#ifdef HAVE_WCHAR_H
2708
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2710 convert a Unicode object to a wide character string.
2711
Victor Stinnerd88d9832011-09-06 02:00:05 +02002712 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 character) required to convert the unicode object. Ignore size argument.
2714
Victor Stinnerd88d9832011-09-06 02:00:05 +02002715 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
2722{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 const wchar_t *wstr;
2725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002726 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 if (wstr == NULL)
2728 return -1;
2729
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size > res)
2732 size = res + 1;
2733 else
2734 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 return res;
2737 }
2738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002740}
2741
2742Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002743PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002744 wchar_t *w,
2745 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
2747 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyErr_BadInternalCall();
2749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752}
2753
Victor Stinner137c34c2010-09-29 10:25:54 +00002754wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002755PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 Py_ssize_t *size)
2757{
2758 wchar_t* buffer;
2759 Py_ssize_t buflen;
2760
2761 if (unicode == NULL) {
2762 PyErr_BadInternalCall();
2763 return NULL;
2764 }
2765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002766 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (buflen == -1)
2768 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 PyErr_NoMemory();
2771 return NULL;
2772 }
2773
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2775 if (buffer == NULL) {
2776 PyErr_NoMemory();
2777 return NULL;
2778 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002779 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (buflen == -1)
2781 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size != NULL)
2783 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 return buffer;
2785}
2786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Alexander Belopolsky40018472011-02-26 01:02:56 +00002789PyObject *
2790PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002793 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_ValueError,
2795 "chr() arg not in range(0x110000)");
2796 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 if (ordinal < 256)
2800 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 v = PyUnicode_New(1, ordinal);
2803 if (v == NULL)
2804 return NULL;
2805 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002806 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002815 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002816 if (PyUnicode_READY(obj))
2817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 Py_INCREF(obj);
2819 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 }
2821 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 /* For a Unicode subtype that's not a Unicode object,
2823 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002824 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002826 PyErr_Format(PyExc_TypeError,
2827 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002828 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830}
2831
Alexander Belopolsky40018472011-02-26 01:02:56 +00002832PyObject *
2833PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002834 const char *encoding,
2835 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002837 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 PyErr_BadInternalCall();
2842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 /* Decoding bytes objects is the most common case and should be fast */
2846 if (PyBytes_Check(obj)) {
2847 if (PyBytes_GET_SIZE(obj) == 0) {
2848 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002849 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 }
2851 else {
2852 v = PyUnicode_Decode(
2853 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2854 encoding, errors);
2855 }
2856 return v;
2857 }
2858
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyErr_SetString(PyExc_TypeError,
2861 "decoding str is not supported");
2862 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002865 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2866 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2867 PyErr_Format(PyExc_TypeError,
2868 "coercing to str: need bytes, bytearray "
2869 "or buffer-like object, %.80s found",
2870 Py_TYPE(obj)->tp_name);
2871 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002872 }
Tim Petersced69f82003-09-16 20:30:58 +00002873
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002876 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` is the output buffer and `lower_len` its total size in bytes,
   terminating NUL included.

   Return 0 on error (the normalized name, including its terminator, does
   not fit into lower_len bytes), 1 on success.  On error the contents of
   `lower` are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* Respect the caller's buffer size for the default name too. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* With an empty buffer there is no room even for the terminator, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 Py_ssize_t size,
2925 const char *encoding,
2926 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002927{
2928 PyObject *buffer = NULL, *unicode;
2929 Py_buffer info;
2930 char lower[11]; /* Enough for any encoding shortcut */
2931
Fred Drakee4315f52000-05-09 19:53:39 +00002932 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002933 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002934 if ((strcmp(lower, "utf-8") == 0) ||
2935 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002936 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002937 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002938 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002939 (strcmp(lower, "iso-8859-1") == 0))
2940 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002941#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002942 else if (strcmp(lower, "mbcs") == 0)
2943 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002944#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002945 else if (strcmp(lower, "ascii") == 0)
2946 return PyUnicode_DecodeASCII(s, size, errors);
2947 else if (strcmp(lower, "utf-16") == 0)
2948 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2949 else if (strcmp(lower, "utf-32") == 0)
2950 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952
2953 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002954 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002955 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002956 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002957 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 if (buffer == NULL)
2959 goto onError;
2960 unicode = PyCodec_Decode(buffer, encoding, errors);
2961 if (unicode == NULL)
2962 goto onError;
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002965 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002966 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 Py_DECREF(unicode);
2968 goto onError;
2969 }
2970 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002971 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002972
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 Py_XDECREF(buffer);
2975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 const char *encoding,
2981 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002982{
2983 PyObject *v;
2984
2985 if (!PyUnicode_Check(unicode)) {
2986 PyErr_BadArgument();
2987 goto onError;
2988 }
2989
2990 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992
2993 /* Decode via the codec registry */
2994 v = PyCodec_Decode(unicode, encoding, errors);
2995 if (v == NULL)
2996 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002997 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000 return NULL;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017
3018 /* Decode via the codec registry */
3019 v = PyCodec_Decode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 if (!PyUnicode_Check(v)) {
3023 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003024 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025 Py_TYPE(v)->tp_name);
3026 Py_DECREF(v);
3027 goto onError;
3028 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003029 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003032 return NULL;
3033}
3034
Alexander Belopolsky40018472011-02-26 01:02:56 +00003035PyObject *
3036PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003037 Py_ssize_t size,
3038 const char *encoding,
3039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
3041 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003042
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 unicode = PyUnicode_FromUnicode(s, size);
3044 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3047 Py_DECREF(unicode);
3048 return v;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003055{
3056 PyObject *v;
3057
3058 if (!PyUnicode_Check(unicode)) {
3059 PyErr_BadArgument();
3060 goto onError;
3061 }
3062
3063 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065
3066 /* Encode via the codec registry */
3067 v = PyCodec_Encode(unicode, encoding, errors);
3068 if (v == NULL)
3069 goto onError;
3070 return v;
3071
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073 return NULL;
3074}
3075
Victor Stinnerad158722010-10-27 00:25:46 +00003076PyObject *
3077PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003078{
Victor Stinner99b95382011-07-04 14:23:54 +02003079#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003080 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003081#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003083#else
Victor Stinner793b5312011-04-27 00:24:21 +02003084 PyInterpreterState *interp = PyThreadState_GET()->interp;
3085 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3086 cannot use it to encode and decode filenames before it is loaded. Load
3087 the Python codec requires to encode at least its own filename. Use the C
3088 version of the locale codec until the codec registry is initialized and
3089 the Python codec is loaded.
3090
3091 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3092 cannot only rely on it: check also interp->fscodec_initialized for
3093 subinterpreters. */
3094 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003095 return PyUnicode_AsEncodedString(unicode,
3096 Py_FileSystemDefaultEncoding,
3097 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003098 }
3099 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003100 /* locale encoding with surrogateescape */
3101 wchar_t *wchar;
3102 char *bytes;
3103 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003104 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105
3106 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3107 if (wchar == NULL)
3108 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003109 bytes = _Py_wchar2char(wchar, &error_pos);
3110 if (bytes == NULL) {
3111 if (error_pos != (size_t)-1) {
3112 char *errmsg = strerror(errno);
3113 PyObject *exc = NULL;
3114 if (errmsg == NULL)
3115 errmsg = "Py_wchar2char() failed";
3116 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003117 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003118 error_pos, error_pos+1,
3119 errmsg);
3120 Py_XDECREF(exc);
3121 }
3122 else
3123 PyErr_NoMemory();
3124 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003125 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003126 }
3127 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003128
3129 bytes_obj = PyBytes_FromString(bytes);
3130 PyMem_Free(bytes);
3131 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003132 }
Victor Stinnerad158722010-10-27 00:25:46 +00003133#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140{
3141 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003142 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003143
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 if (!PyUnicode_Check(unicode)) {
3145 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003146 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 }
Fred Drakee4315f52000-05-09 19:53:39 +00003148
Fred Drakee4315f52000-05-09 19:53:39 +00003149 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003150 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003151 if ((strcmp(lower, "utf-8") == 0) ||
3152 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003153 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003154 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003155 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003156 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003158 }
Victor Stinner37296e82010-06-10 13:36:23 +00003159 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003160 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003161 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003163#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003164 else if (strcmp(lower, "mbcs") == 0)
3165 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003166#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003167 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
3171 /* Encode via the codec registry */
3172 v = PyCodec_Encode(unicode, encoding, errors);
3173 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003174 return NULL;
3175
3176 /* The normal path */
3177 if (PyBytes_Check(v))
3178 return v;
3179
3180 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003182 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003183 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003184
3185 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3186 "encoder %s returned bytearray instead of bytes",
3187 encoding);
3188 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003189 Py_DECREF(v);
3190 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003191 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003193 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3194 Py_DECREF(v);
3195 return b;
3196 }
3197
3198 PyErr_Format(PyExc_TypeError,
3199 "encoder did not return a bytes object (type=%.400s)",
3200 Py_TYPE(v)->tp_name);
3201 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202 return NULL;
3203}
3204
Alexander Belopolsky40018472011-02-26 01:02:56 +00003205PyObject *
3206PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003207 const char *encoding,
3208 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003209{
3210 PyObject *v;
3211
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
3214 goto onError;
3215 }
3216
3217 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003219
3220 /* Encode via the codec registry */
3221 v = PyCodec_Encode(unicode, encoding, errors);
3222 if (v == NULL)
3223 goto onError;
3224 if (!PyUnicode_Check(v)) {
3225 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003226 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003227 Py_TYPE(v)->tp_name);
3228 Py_DECREF(v);
3229 goto onError;
3230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return NULL;
3235}
3236
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003237PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003238PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003239 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003240 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3241}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242
Christian Heimes5894ba72007-11-04 11:43:14 +00003243PyObject*
3244PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3245{
Victor Stinner99b95382011-07-04 14:23:54 +02003246#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003247 return PyUnicode_DecodeMBCS(s, size, NULL);
3248#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003249 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003250#else
Victor Stinner793b5312011-04-27 00:24:21 +02003251 PyInterpreterState *interp = PyThreadState_GET()->interp;
3252 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3253 cannot use it to encode and decode filenames before it is loaded. Load
3254 the Python codec requires to encode at least its own filename. Use the C
3255 version of the locale codec until the codec registry is initialized and
3256 the Python codec is loaded.
3257
3258 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3259 cannot only rely on it: check also interp->fscodec_initialized for
3260 subinterpreters. */
3261 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003262 return PyUnicode_Decode(s, size,
3263 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003264 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003265 }
3266 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003267 /* locale encoding with surrogateescape */
3268 wchar_t *wchar;
3269 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003270 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003271
3272 if (s[size] != '\0' || size != strlen(s)) {
3273 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3274 return NULL;
3275 }
3276
Victor Stinner168e1172010-10-16 23:16:16 +00003277 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003278 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003279 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003280
Victor Stinner168e1172010-10-16 23:16:16 +00003281 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003282 PyMem_Free(wchar);
3283 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003284 }
Victor Stinnerad158722010-10-27 00:25:46 +00003285#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003286}
3287
Martin v. Löwis011e8422009-05-05 04:43:17 +00003288
3289int
3290PyUnicode_FSConverter(PyObject* arg, void* addr)
3291{
3292 PyObject *output = NULL;
3293 Py_ssize_t size;
3294 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003295 if (arg == NULL) {
3296 Py_DECREF(*(PyObject**)addr);
3297 return 1;
3298 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003299 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003300 output = arg;
3301 Py_INCREF(output);
3302 }
3303 else {
3304 arg = PyUnicode_FromObject(arg);
3305 if (!arg)
3306 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003307 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003308 Py_DECREF(arg);
3309 if (!output)
3310 return 0;
3311 if (!PyBytes_Check(output)) {
3312 Py_DECREF(output);
3313 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3314 return 0;
3315 }
3316 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003317 size = PyBytes_GET_SIZE(output);
3318 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003319 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003320 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003321 Py_DECREF(output);
3322 return 0;
3323 }
3324 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003325 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003326}
3327
3328
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003329int
3330PyUnicode_FSDecoder(PyObject* arg, void* addr)
3331{
3332 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003333 if (arg == NULL) {
3334 Py_DECREF(*(PyObject**)addr);
3335 return 1;
3336 }
3337 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003338 if (PyUnicode_READY(arg))
3339 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003340 output = arg;
3341 Py_INCREF(output);
3342 }
3343 else {
3344 arg = PyBytes_FromObject(arg);
3345 if (!arg)
3346 return 0;
3347 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3348 PyBytes_GET_SIZE(arg));
3349 Py_DECREF(arg);
3350 if (!output)
3351 return 0;
3352 if (!PyUnicode_Check(output)) {
3353 Py_DECREF(output);
3354 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3355 return 0;
3356 }
3357 }
Victor Stinner065836e2011-10-27 01:56:33 +02003358 if (PyUnicode_READY(output) < 0) {
3359 Py_DECREF(output);
3360 return 0;
3361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003362 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003363 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003364 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3365 Py_DECREF(output);
3366 return 0;
3367 }
3368 *(PyObject**)addr = output;
3369 return Py_CLEANUP_SUPPORTED;
3370}
3371
3372
Martin v. Löwis5b222132007-06-10 09:51:05 +00003373char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003374PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003375{
Christian Heimesf3863112007-11-22 07:46:41 +00003376 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003377
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003378 if (!PyUnicode_Check(unicode)) {
3379 PyErr_BadArgument();
3380 return NULL;
3381 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003382 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003383 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003385 if (PyUnicode_UTF8(unicode) == NULL) {
3386 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3388 if (bytes == NULL)
3389 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003390 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3391 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 Py_DECREF(bytes);
3393 return NULL;
3394 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003395 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3396 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3397 PyBytes_AS_STRING(bytes),
3398 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 Py_DECREF(bytes);
3400 }
3401
3402 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003403 *psize = PyUnicode_UTF8_LENGTH(unicode);
3404 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003405}
3406
3407char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3411}
3412
#ifdef Py_DEBUG
/* Debug-build counter: bumped by PyUnicode_AsUnicodeAndSize() every time
   it has to build a wchar_t representation that was not stored yet. */
static int unicode_as_unicode_calls = 0;
#endif
3416
3417
3418Py_UNICODE *
3419PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 const unsigned char *one_byte;
3422#if SIZEOF_WCHAR_T == 4
3423 const Py_UCS2 *two_bytes;
3424#else
3425 const Py_UCS4 *four_bytes;
3426 const Py_UCS4 *ucs4_end;
3427 Py_ssize_t num_surrogates;
3428#endif
3429 wchar_t *w;
3430 wchar_t *wchar_end;
3431
3432 if (!PyUnicode_Check(unicode)) {
3433 PyErr_BadArgument();
3434 return NULL;
3435 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003436 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003437 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003438 assert(_PyUnicode_KIND(unicode) != 0);
3439 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440
3441#ifdef Py_DEBUG
3442 ++unicode_as_unicode_calls;
3443#endif
3444
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003445 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003446#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003447 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3448 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003449 num_surrogates = 0;
3450
3451 for (; four_bytes < ucs4_end; ++four_bytes) {
3452 if (*four_bytes > 0xFFFF)
3453 ++num_surrogates;
3454 }
3455
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3457 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3458 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 PyErr_NoMemory();
3460 return NULL;
3461 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003462 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003464 w = _PyUnicode_WSTR(unicode);
3465 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3466 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003467 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3468 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003469 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003471 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3472 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003473 }
3474 else
3475 *w = *four_bytes;
3476
3477 if (w > wchar_end) {
3478 assert(0 && "Miscalculated string end");
3479 }
3480 }
3481 *w = 0;
3482#else
3483 /* sizeof(wchar_t) == 4 */
3484 Py_FatalError("Impossible unicode object state, wstr and str "
3485 "should share memory already.");
3486 return NULL;
3487#endif
3488 }
3489 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003490 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3491 (_PyUnicode_LENGTH(unicode) + 1));
3492 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003493 PyErr_NoMemory();
3494 return NULL;
3495 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003496 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3497 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3498 w = _PyUnicode_WSTR(unicode);
3499 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003501 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3502 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003503 for (; w < wchar_end; ++one_byte, ++w)
3504 *w = *one_byte;
3505 /* null-terminate the wstr */
3506 *w = 0;
3507 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003509#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003510 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003511 for (; w < wchar_end; ++two_bytes, ++w)
3512 *w = *two_bytes;
3513 /* null-terminate the wstr */
3514 *w = 0;
3515#else
3516 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003517 PyObject_FREE(_PyUnicode_WSTR(unicode));
3518 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 Py_FatalError("Impossible unicode object state, wstr "
3520 "and str should share memory already.");
3521 return NULL;
3522#endif
3523 }
3524 else {
3525 assert(0 && "This should never happen.");
3526 }
3527 }
3528 }
3529 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003530 *size = PyUnicode_WSTR_LENGTH(unicode);
3531 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003532}
3533
Alexander Belopolsky40018472011-02-26 01:02:56 +00003534Py_UNICODE *
3535PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538}
3539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003540
Alexander Belopolsky40018472011-02-26 01:02:56 +00003541Py_ssize_t
3542PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543{
3544 if (!PyUnicode_Check(unicode)) {
3545 PyErr_BadArgument();
3546 goto onError;
3547 }
3548 return PyUnicode_GET_SIZE(unicode);
3549
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return -1;
3552}
3553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003554Py_ssize_t
3555PyUnicode_GetLength(PyObject *unicode)
3556{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003557 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003558 PyErr_BadArgument();
3559 return -1;
3560 }
3561
3562 return PyUnicode_GET_LENGTH(unicode);
3563}
3564
3565Py_UCS4
3566PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3567{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003568 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3569 PyErr_BadArgument();
3570 return (Py_UCS4)-1;
3571 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003572 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003573 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003574 return (Py_UCS4)-1;
3575 }
3576 return PyUnicode_READ_CHAR(unicode, index);
3577}
3578
3579int
3580PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3581{
3582 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003583 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003584 return -1;
3585 }
Victor Stinner488fa492011-12-12 00:01:39 +01003586 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003587 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003588 PyErr_SetString(PyExc_IndexError, "string index out of range");
3589 return -1;
3590 }
Victor Stinner488fa492011-12-12 00:01:39 +01003591 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003592 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003593 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3594 index, ch);
3595 return 0;
3596}
3597
/* Return the name of the default encoding as a NUL-terminated C string.
   The value is the fixed string "utf-8"; the caller must not modify or
   free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3603
Victor Stinner554f3f02010-06-16 23:33:54 +00003604/* create or adjust a UnicodeDecodeError */
3605static void
3606make_decode_exception(PyObject **exceptionObject,
3607 const char *encoding,
3608 const char *input, Py_ssize_t length,
3609 Py_ssize_t startpos, Py_ssize_t endpos,
3610 const char *reason)
3611{
3612 if (*exceptionObject == NULL) {
3613 *exceptionObject = PyUnicodeDecodeError_Create(
3614 encoding, input, length, startpos, endpos, reason);
3615 }
3616 else {
3617 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3618 goto onError;
3619 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3620 goto onError;
3621 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3622 goto onError;
3623 }
3624 return;
3625
3626onError:
3627 Py_DECREF(*exceptionObject);
3628 *exceptionObject = NULL;
3629}
3630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631/* error handling callback helper:
3632 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003633 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 and adjust various state variables.
3635 return 0 on success, -1 on error
3636*/
3637
Alexander Belopolsky40018472011-02-26 01:02:56 +00003638static int
3639unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003640 const char *encoding, const char *reason,
3641 const char **input, const char **inend, Py_ssize_t *startinpos,
3642 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003643 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003645 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003646
3647 PyObject *restuple = NULL;
3648 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003649 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003650 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003651 Py_ssize_t requiredsize;
3652 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003653 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 int res = -1;
3655
Victor Stinner596a6c42011-11-09 00:02:18 +01003656 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3657 outsize = PyUnicode_GET_LENGTH(*output);
3658 else
3659 outsize = _PyUnicode_WSTR_LENGTH(*output);
3660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 *errorHandler = PyCodec_LookupError(errors);
3663 if (*errorHandler == NULL)
3664 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 }
3666
Victor Stinner554f3f02010-06-16 23:33:54 +00003667 make_decode_exception(exceptionObject,
3668 encoding,
3669 *input, *inend - *input,
3670 *startinpos, *endinpos,
3671 reason);
3672 if (*exceptionObject == NULL)
3673 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674
3675 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3676 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003679 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 }
3682 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003684 if (PyUnicode_READY(repunicode) < 0)
3685 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003686
3687 /* Copy back the bytes variables, which might have been modified by the
3688 callback */
3689 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3690 if (!inputobj)
3691 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003692 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003694 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003695 *input = PyBytes_AS_STRING(inputobj);
3696 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003697 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003698 /* we can DECREF safely, as the exception has another reference,
3699 so the object won't go away. */
3700 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003704 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3706 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708
Victor Stinner596a6c42011-11-09 00:02:18 +01003709 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3710 /* need more space? (at least enough for what we
3711 have+the replacement+the rest of the string (starting
3712 at the new input position), so we won't have to check space
3713 when there are no errors in the rest of the string) */
3714 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3715 requiredsize = *outpos + replen + insize-newpos;
3716 if (requiredsize > outsize) {
3717 if (requiredsize<2*outsize)
3718 requiredsize = 2*outsize;
3719 if (unicode_resize(output, requiredsize) < 0)
3720 goto onError;
3721 }
3722 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003724 copy_characters(*output, *outpos, repunicode, 0, replen);
3725 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003727 else {
3728 wchar_t *repwstr;
3729 Py_ssize_t repwlen;
3730 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3731 if (repwstr == NULL)
3732 goto onError;
3733 /* need more space? (at least enough for what we
3734 have+the replacement+the rest of the string (starting
3735 at the new input position), so we won't have to check space
3736 when there are no errors in the rest of the string) */
3737 requiredsize = *outpos + repwlen + insize-newpos;
3738 if (requiredsize > outsize) {
3739 if (requiredsize < 2*outsize)
3740 requiredsize = 2*outsize;
3741 if (unicode_resize(output, requiredsize) < 0)
3742 goto onError;
3743 }
3744 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3745 *outpos += repwlen;
3746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003748 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003749
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003750 /* we made it! */
3751 res = 0;
3752
Benjamin Peterson29060642009-01-31 22:14:21 +00003753 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_XDECREF(restuple);
3755 return res;
3756}
3757
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003758/* --- UTF-7 Codec -------------------------------------------------------- */
3759
Antoine Pitrou244651a2009-05-04 18:56:13 +00003760/* See RFC2152 for details. We encode conservatively and decode liberally. */
3761
/* Base-64 helpers for the UTF-7 codec.  Every macro below may evaluate its
   argument more than once, so only pass expressions without side effects. */

/* Does c belong to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds: anything not matched earlier is treated as '/'. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: does this byte of a UTF-7 string decode as itself?
 * Decoding is permissive: every ASCII byte does, except '+', which
 * opens a base64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
3792
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by ASCII
 * code 0..127, identifies these different sets:
 *       0 : "Set D"
 *           alphanumeric and '(),-./:?
 *       1 : "Set O"
 *           !"#$%&*;<=>@[]^_`{|}
 *       2 : "whitespace"
 *           ht nl cr sp
 *       3 : special (must be base64 encoded)
 *           everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be side-effect free (it is evaluated several times).  NUL and
 * anything >= 128 are never direct.  directO and directWS are
 * parenthesized in the expansion so that any expression may be passed. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003838
Alexander Belopolsky40018472011-02-26 01:02:56 +00003839PyObject *
3840PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003841 Py_ssize_t size,
3842 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003843{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003844 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3845}
3846
Antoine Pitrou244651a2009-05-04 18:56:13 +00003847/* The decoder. The only state we preserve is our read position,
3848 * i.e. how many characters we have consumed. So if we end in the
3849 * middle of a shift sequence we have to back off the read position
3850 * and the output to the beginning of the sequence, otherwise we lose
3851 * all the shift state (seen bits, number of bits seen, high
3852 * surrogate). */
3853
Alexander Belopolsky40018472011-02-26 01:02:56 +00003854PyObject *
3855PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003856 Py_ssize_t size,
3857 const char *errors,
3858 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003861 Py_ssize_t startinpos;
3862 Py_ssize_t endinpos;
3863 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003865 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003866 const char *errmsg = "";
3867 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003868 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869 unsigned int base64bits = 0;
3870 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003871 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 PyObject *errorHandler = NULL;
3873 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003875 /* Start off assuming it's all ASCII. Widen later as necessary. */
3876 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 if (!unicode)
3878 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003879 if (size == 0) {
3880 if (consumed)
3881 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003882 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003883 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003885 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003886 e = s + size;
3887
3888 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003889 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003890 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003891 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 if (inShift) { /* in a base-64 section */
3894 if (IS_BASE64(ch)) { /* consume a base-64 character */
3895 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3896 base64bits += 6;
3897 s++;
3898 if (base64bits >= 16) {
3899 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003900 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 base64bits -= 16;
3902 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3903 if (surrogate) {
3904 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003905 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3906 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003907 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3908 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003909 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003910 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 }
3912 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003913 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3914 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 }
3917 }
Victor Stinner551ac952011-11-29 22:58:13 +01003918 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003919 /* first surrogate */
3920 surrogate = outCh;
3921 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003923 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3924 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003925 }
3926 }
3927 }
3928 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003929 inShift = 0;
3930 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003932 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3933 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003934 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003935 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 if (base64bits > 0) { /* left-over bits */
3937 if (base64bits >= 6) {
3938 /* We've seen at least one base-64 character */
3939 errmsg = "partial character in shift sequence";
3940 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 else {
3943 /* Some bits remain; they should be zero */
3944 if (base64buffer != 0) {
3945 errmsg = "non-zero padding bits in shift sequence";
3946 goto utf7Error;
3947 }
3948 }
3949 }
3950 if (ch != '-') {
3951 /* '-' is absorbed; other terminating
3952 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003953 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3954 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003956 }
3957 }
3958 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960 s++; /* consume '+' */
3961 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003963 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3964 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003965 }
3966 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003969 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003970 }
3971 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003972 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3974 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 s++;
3976 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 else {
3978 startinpos = s-starts;
3979 s++;
3980 errmsg = "unexpected special character";
3981 goto utf7Error;
3982 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 endinpos = s-starts;
3986 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003987 errors, &errorHandler,
3988 "utf7", errmsg,
3989 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003990 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 }
3993
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994 /* end of string */
3995
3996 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3997 /* if we're in an inconsistent state, that's an error */
3998 if (surrogate ||
3999 (base64bits >= 6) ||
4000 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004001 endinpos = size;
4002 if (unicode_decode_call_errorhandler(
4003 errors, &errorHandler,
4004 "utf7", "unterminated shift sequence",
4005 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004006 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 goto onError;
4008 if (s < e)
4009 goto restart;
4010 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004011 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012
4013 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004014 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004015 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004016 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004017 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018 }
4019 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004020 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004021 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004024 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004025 goto onError;
4026
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004027 Py_XDECREF(errorHandler);
4028 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004029 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 Py_XDECREF(errorHandler);
4033 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034 Py_DECREF(unicode);
4035 return NULL;
4036}
4037
4038
Alexander Belopolsky40018472011-02-26 01:02:56 +00004039PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004040_PyUnicode_EncodeUTF7(PyObject *str,
4041 int base64SetO,
4042 int base64WhiteSpace,
4043 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004044{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004045 int kind;
4046 void *data;
4047 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004048 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004049 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004050 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004051 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004052 unsigned int base64bits = 0;
4053 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004054 char * out;
4055 char * start;
4056
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004057 if (PyUnicode_READY(str) < 0)
4058 return NULL;
4059 kind = PyUnicode_KIND(str);
4060 data = PyUnicode_DATA(str);
4061 len = PyUnicode_GET_LENGTH(str);
4062
4063 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004066 /* It might be possible to tighten this worst case */
4067 allocated = 8 * len;
4068 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004069 return PyErr_NoMemory();
4070
Antoine Pitrou244651a2009-05-04 18:56:13 +00004071 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004072 if (v == NULL)
4073 return NULL;
4074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004076 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004077 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079 if (inShift) {
4080 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4081 /* shifting out */
4082 if (base64bits) { /* output remaining bits */
4083 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4084 base64buffer = 0;
4085 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004086 }
4087 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004088 /* Characters not in the BASE64 set implicitly unshift the sequence
4089 so no '-' is required, except if the character is itself a '-' */
4090 if (IS_BASE64(ch) || ch == '-') {
4091 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004092 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004093 *out++ = (char) ch;
4094 }
4095 else {
4096 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004097 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004099 else { /* not in a shift sequence */
4100 if (ch == '+') {
4101 *out++ = '+';
4102 *out++ = '-';
4103 }
4104 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4105 *out++ = (char) ch;
4106 }
4107 else {
4108 *out++ = '+';
4109 inShift = 1;
4110 goto encode_char;
4111 }
4112 }
4113 continue;
4114encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004115 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004116 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004117
Antoine Pitrou244651a2009-05-04 18:56:13 +00004118 /* code first surrogate */
4119 base64bits += 16;
4120 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4121 while (base64bits >= 6) {
4122 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4123 base64bits -= 6;
4124 }
4125 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004126 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004127 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004128 base64bits += 16;
4129 base64buffer = (base64buffer << 16) | ch;
4130 while (base64bits >= 6) {
4131 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4132 base64bits -= 6;
4133 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004134 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004135 if (base64bits)
4136 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4137 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004138 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004139 if (_PyBytes_Resize(&v, out - start) < 0)
4140 return NULL;
4141 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004142}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004143PyObject *
4144PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4145 Py_ssize_t size,
4146 int base64SetO,
4147 int base64WhiteSpace,
4148 const char *errors)
4149{
4150 PyObject *result;
4151 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4152 if (tmp == NULL)
4153 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004154 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004155 base64WhiteSpace, errors);
4156 Py_DECREF(tmp);
4157 return result;
4158}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004159
Antoine Pitrou244651a2009-05-04 18:56:13 +00004160#undef IS_BASE64
4161#undef FROM_BASE64
4162#undef TO_BASE64
4163#undef DECODE_DIRECT
4164#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166/* --- UTF-8 Codec -------------------------------------------------------- */
4167
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4189
Alexander Belopolsky40018472011-02-26 01:02:56 +00004190PyObject *
4191PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004192 Py_ssize_t size,
4193 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194{
Walter Dörwald69652032004-09-07 20:24:22 +00004195 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4196}
4197
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004198#include "stringlib/ucs1lib.h"
4199#include "stringlib/codecs.h"
4200#include "stringlib/undef.h"
4201
4202#include "stringlib/ucs2lib.h"
4203#include "stringlib/codecs.h"
4204#include "stringlib/undef.h"
4205
4206#include "stringlib/ucs4lib.h"
4207#include "stringlib/codecs.h"
4208#include "stringlib/undef.h"
4209
Antoine Pitrouab868312009-01-10 15:40:25 +00004210/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4211#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4212
4213/* Mask to quickly check whether a C 'long' contains a
4214 non-ASCII, UTF8-encoded char. */
4215#if (SIZEOF_LONG == 8)
4216# define ASCII_CHAR_MASK 0x8080808080808080L
4217#elif (SIZEOF_LONG == 4)
4218# define ASCII_CHAR_MASK 0x80808080L
4219#else
4220# error C 'long' size should be either 4 or 8!
4221#endif
4222
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004223/* Scans a UTF-8 string and returns the maximum character to be expected
4224 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004226 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228 */
4229static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004230utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 const unsigned char *end = p + string_size;
4234 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236 assert(unicode_size != NULL);
4237
4238 /* By having a cascade of independent loops which fallback onto each
4239 other, we minimize the amount of work done in the average loop
4240 iteration, and we also maximize the CPU's ability to predict
4241 branches correctly (because a given condition will have always the
4242 same boolean outcome except perhaps in the last iteration of the
4243 corresponding loop).
4244 In the general case this brings us rather close to decoding
4245 performance pre-PEP 393, despite the two-pass decoding.
4246
4247 Note that the pure ASCII loop is not duplicated once a non-ASCII
4248 character has been encountered. It is actually a pessimization (by
4249 a significant factor) to use this loop on text with many non-ASCII
4250 characters, and it is important to avoid bad performance on valid
4251 utf-8 data (invalid utf-8 being a different can of worms).
4252 */
4253
4254 /* ASCII */
4255 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256 /* Only check value if it's not a ASCII char... */
4257 if (*p < 0x80) {
4258 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4259 an explanation. */
4260 if (!((size_t) p & LONG_PTR_MASK)) {
4261 /* Help register allocation */
4262 register const unsigned char *_p = p;
4263 while (_p < aligned_end) {
4264 unsigned long value = *(unsigned long *) _p;
4265 if (value & ASCII_CHAR_MASK)
4266 break;
4267 _p += SIZEOF_LONG;
4268 char_count += SIZEOF_LONG;
4269 }
4270 p = _p;
4271 if (p == end)
4272 break;
4273 }
4274 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004275 if (*p < 0x80)
4276 ++char_count;
4277 else
4278 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004280 *unicode_size = char_count;
4281 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004282
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004283_ucs1loop:
4284 for (; p < end; ++p) {
4285 if (*p < 0xc4)
4286 char_count += ((*p & 0xc0) != 0x80);
4287 else
4288 goto _ucs2loop;
4289 }
4290 *unicode_size = char_count;
4291 return 255;
4292
4293_ucs2loop:
4294 for (; p < end; ++p) {
4295 if (*p < 0xf0)
4296 char_count += ((*p & 0xc0) != 0x80);
4297 else
4298 goto _ucs4loop;
4299 }
4300 *unicode_size = char_count;
4301 return 65535;
4302
4303_ucs4loop:
4304 for (; p < end; ++p) {
4305 char_count += ((*p & 0xc0) != 0x80);
4306 }
4307 *unicode_size = char_count;
4308 return 65537;
4309}
4310
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors.

   Stores `value` at position `index` of `unicode`.  When the position lies
   beyond the current length, the string is first grown with
   unicode_resize() to index + index/8; this overallocates, so the result
   needs to shrink at the end.  The character is then stored with
   unicode_putchar().  If either call fails, control jumps to `onError`.

   Implicit parameters: a `PyObject *unicode` variable and an `onError`
   label in the enclosing function.

   Each argument is parenthesized and evaluated exactly once.  The macro
   declares a local named `pos`, so `index` must not itself refer to a
   variable called `pos`.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = (index);                                       \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                      \
            unicode_resize(&unicode, pos + pos/8) < 0)                  \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, (value)) < 0)               \
            goto onError;                                               \
    } while (0)
4324
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004325static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004326decode_utf8_errors(const char *starts,
4327 Py_ssize_t size,
4328 const char *errors,
4329 Py_ssize_t *consumed,
4330 const char *s,
4331 PyObject *unicode,
4332 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004333{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004335 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004336 Py_ssize_t startinpos;
4337 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004338 const char *e = starts + size;
4339 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004340 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 PyObject *errorHandler = NULL;
4342 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004343
Antoine Pitrouab868312009-01-10 15:40:25 +00004344 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345
4346 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004347 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348
4349 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004350 /* Fast path for runs of ASCII characters. Given that common UTF-8
4351 input will consist of an overwhelming majority of ASCII
4352 characters, we try to optimize for this case by checking
4353 as many characters as a C 'long' can contain.
4354 First, check if we can do an aligned read, as most CPUs have
4355 a penalty for unaligned reads.
4356 */
4357 if (!((size_t) s & LONG_PTR_MASK)) {
4358 /* Help register allocation */
4359 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004361 while (_s < aligned_end) {
4362 /* Read a whole long at a time (either 4 or 8 bytes),
4363 and do a fast unrolled copy if it only contains ASCII
4364 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004365 unsigned long value = *(unsigned long *) _s;
4366 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004367 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4369 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4370 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4371 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004372#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4374 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4375 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4376 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004377#endif
4378 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004379 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004380 }
4381 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004383 if (s == e)
4384 break;
4385 ch = (unsigned char)*s;
4386 }
4387 }
4388
4389 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004390 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 s++;
4392 continue;
4393 }
4394
4395 n = utf8_code_length[ch];
4396
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004397 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 if (consumed)
4399 break;
4400 else {
4401 errmsg = "unexpected end of data";
4402 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004403 endinpos = startinpos+1;
4404 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4405 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 goto utf8Error;
4407 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410 switch (n) {
4411
4412 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004413 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 startinpos = s-starts;
4415 endinpos = startinpos+1;
4416 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417
4418 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004419 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 startinpos = s-starts;
4421 endinpos = startinpos+1;
4422 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423
4424 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004425 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004426 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004428 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto utf8Error;
4430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004432 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004433 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 break;
4435
4436 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004437 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4438 will result in surrogates in range d800-dfff. Surrogates are
4439 not valid UTF-8 so they are rejected.
4440 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4441 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004442 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 (s[2] & 0xc0) != 0x80 ||
4444 ((unsigned char)s[0] == 0xE0 &&
4445 (unsigned char)s[1] < 0xA0) ||
4446 ((unsigned char)s[0] == 0xED &&
4447 (unsigned char)s[1] > 0x9F)) {
4448 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004450 endinpos = startinpos + 1;
4451
4452 /* if s[1] first two bits are 1 and 0, then the invalid
4453 continuation byte is s[2], so increment endinpos by 1,
4454 if not, s[1] is invalid and endinpos doesn't need to
4455 be incremented. */
4456 if ((s[1] & 0xC0) == 0x80)
4457 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 goto utf8Error;
4459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004461 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004462 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004463 break;
4464
4465 case 4:
4466 if ((s[1] & 0xc0) != 0x80 ||
4467 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004468 (s[3] & 0xc0) != 0x80 ||
4469 ((unsigned char)s[0] == 0xF0 &&
4470 (unsigned char)s[1] < 0x90) ||
4471 ((unsigned char)s[0] == 0xF4 &&
4472 (unsigned char)s[1] > 0x8F)) {
4473 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004475 endinpos = startinpos + 1;
4476 if ((s[1] & 0xC0) == 0x80) {
4477 endinpos++;
4478 if ((s[2] & 0xC0) == 0x80)
4479 endinpos++;
4480 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 goto utf8Error;
4482 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004483 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004484 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004485 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004486
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004487 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 }
4490 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004492
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 if (unicode_decode_call_errorhandler(
4495 errors, &errorHandler,
4496 "utf8", errmsg,
4497 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004498 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 /* Update data because unicode_decode_call_errorhandler might have
4501 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 }
Walter Dörwald69652032004-09-07 20:24:22 +00004504 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004505 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004507 /* Adjust length and ready string when it contained errors and
4508 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004509 if (unicode_resize(&unicode, i) < 0)
4510 goto onError;
4511 unicode_adjust_maxchar(&unicode);
4512 if (unicode == NULL)
4513 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 Py_XDECREF(errorHandler);
4516 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004517 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004518 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 Py_XDECREF(errorHandler);
4522 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004523 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 return NULL;
4525}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004526#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004527
Victor Stinner785938e2011-12-11 20:09:03 +01004528PyObject *
4529PyUnicode_DecodeUTF8Stateful(const char *s,
4530 Py_ssize_t size,
4531 const char *errors,
4532 Py_ssize_t *consumed)
4533{
4534 Py_UCS4 maxchar = 0;
4535 Py_ssize_t unicode_size;
4536 int has_errors = 0;
4537 PyObject *unicode;
4538 int kind;
4539 void *data;
4540 const char *starts = s;
4541 const char *e;
4542 Py_ssize_t i;
4543
4544 if (size == 0) {
4545 if (consumed)
4546 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004547 Py_INCREF(unicode_empty);
4548 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004549 }
4550
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004551 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004552
4553 /* When the string is ASCII only, just use memcpy and return.
4554 unicode_size may be != size if there is an incomplete UTF-8
4555 sequence at the end of the ASCII block. */
4556 if (maxchar < 128 && size == unicode_size) {
4557 if (consumed)
4558 *consumed = size;
4559 return unicode_fromascii(s, size);
4560 }
4561
4562 unicode = PyUnicode_New(unicode_size, maxchar);
4563 if (!unicode)
4564 return NULL;
4565 kind = PyUnicode_KIND(unicode);
4566 data = PyUnicode_DATA(unicode);
4567
4568 /* Unpack UTF-8 encoded data */
4569 i = 0;
4570 e = starts + size;
4571 switch (kind) {
4572 case PyUnicode_1BYTE_KIND:
4573 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4574 break;
4575 case PyUnicode_2BYTE_KIND:
4576 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4577 break;
4578 case PyUnicode_4BYTE_KIND:
4579 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4580 break;
4581 }
4582 if (!has_errors) {
4583 /* Ensure the unicode size calculation was correct */
4584 assert(i == unicode_size);
4585 assert(s == e);
4586 if (consumed)
4587 *consumed = size;
4588 return unicode;
4589 }
4590
4591 /* In case of errors, maxchar and size computation might be incorrect;
4592 code below refits and resizes as necessary. */
4593 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4594}
4595
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly allocated,
   NUL-terminated wchar_t array obtained from PyMem_Malloc().  Every byte
   that cannot be decoded is stored as 0xDC00 + byte and decoding resumes
   with the following byte.

   Returns NULL if `size` is negative or too large for the allocation
   (PyErr_NoMemory() is called in that case) or if PyMem_Malloc() fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements are enough.
       The guard is written without computing `size + 1` (which would
       overflow for size == PY_SSIZE_T_MAX) and compares signed values
       only; negative sizes are rejected as well. */
    if (size < 0 ||
        size > (Py_ssize_t)(PY_SSIZE_T_MAX / sizeof(wchar_t)) - 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* `ch` keeps the value of the lead byte until a sequence has been
           fully validated; the escape path below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* truncated sequence at the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Escape the offending byte and retry from the next one. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704/* Primary internal function which creates utf8 encoded bytes objects.
4705
4706 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004707 and allocate exactly as much space needed at the end. Else allocate the
4708 maximum possible needed (4 result bytes per Unicode character), and return
4709 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004710*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004711PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004712_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713{
Tim Peters602f7402002-04-27 18:03:26 +00004714#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004715
Guido van Rossum98297ee2007-11-06 21:34:58 +00004716 Py_ssize_t i; /* index into s of next input byte */
4717 PyObject *result; /* result string object */
4718 char *p; /* next free byte in output buffer */
4719 Py_ssize_t nallocated; /* number of result bytes allocated */
4720 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004721 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004722 PyObject *errorHandler = NULL;
4723 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004724 int kind;
4725 void *data;
4726 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004727 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 if (!PyUnicode_Check(unicode)) {
4730 PyErr_BadArgument();
4731 return NULL;
4732 }
4733
4734 if (PyUnicode_READY(unicode) == -1)
4735 return NULL;
4736
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004737 if (PyUnicode_UTF8(unicode))
4738 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4739 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740
4741 kind = PyUnicode_KIND(unicode);
4742 data = PyUnicode_DATA(unicode);
4743 size = PyUnicode_GET_LENGTH(unicode);
4744
Tim Peters602f7402002-04-27 18:03:26 +00004745 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746
Tim Peters602f7402002-04-27 18:03:26 +00004747 if (size <= MAX_SHORT_UNICHARS) {
4748 /* Write into the stack buffer; nallocated can't overflow.
4749 * At the end, we'll allocate exactly as much heap space as it
4750 * turns out we need.
4751 */
4752 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004753 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004754 p = stackbuf;
4755 }
4756 else {
4757 /* Overallocate on the heap, and give the excess back at the end. */
4758 nallocated = size * 4;
4759 if (nallocated / 4 != size) /* overflow! */
4760 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004761 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004762 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004763 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004764 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004765 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004766
Tim Peters602f7402002-04-27 18:03:26 +00004767 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004769
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004770 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004771 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004773
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004775 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004776 *p++ = (char)(0xc0 | (ch >> 6));
4777 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004778 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 Py_ssize_t repsize, k, startpos;
4781 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 rep = unicode_encode_call_errorhandler(
4783 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004784 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 if (!rep)
4786 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 if (PyBytes_Check(rep))
4789 repsize = PyBytes_GET_SIZE(rep);
4790 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004791 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792
4793 if (repsize > 4) {
4794 Py_ssize_t offset;
4795
4796 if (result == NULL)
4797 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004798 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004801 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4802 /* integer overflow */
4803 PyErr_NoMemory();
4804 goto error;
4805 }
4806 nallocated += repsize - 4;
4807 if (result != NULL) {
4808 if (_PyBytes_Resize(&result, nallocated) < 0)
4809 goto error;
4810 } else {
4811 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004812 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 goto error;
4814 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4815 }
4816 p = PyBytes_AS_STRING(result) + offset;
4817 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819 if (PyBytes_Check(rep)) {
4820 char *prep = PyBytes_AS_STRING(rep);
4821 for(k = repsize; k > 0; k--)
4822 *p++ = *prep++;
4823 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004824 enum PyUnicode_Kind repkind;
4825 void *repdata;
4826
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004827 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004828 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004829 repkind = PyUnicode_KIND(rep);
4830 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831
4832 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004833 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004835 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004836 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004837 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004839 goto error;
4840 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004841 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004842 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004843 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004844 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004845 } else if (ch < 0x10000) {
4846 *p++ = (char)(0xe0 | (ch >> 12));
4847 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4848 *p++ = (char)(0x80 | (ch & 0x3f));
4849 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004850 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004851 /* Encode UCS4 Unicode ordinals */
4852 *p++ = (char)(0xf0 | (ch >> 18));
4853 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4854 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4855 *p++ = (char)(0x80 | (ch & 0x3f));
4856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004858
Guido van Rossum98297ee2007-11-06 21:34:58 +00004859 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004860 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004861 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004862 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004863 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004864 }
4865 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004866 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004867 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004868 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004869 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004872 Py_XDECREF(errorHandler);
4873 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004874 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004875 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004876 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004877 Py_XDECREF(errorHandler);
4878 Py_XDECREF(exc);
4879 Py_XDECREF(result);
4880 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004881
Tim Peters602f7402002-04-27 18:03:26 +00004882#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883}
4884
Alexander Belopolsky40018472011-02-26 01:02:56 +00004885PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4887 Py_ssize_t size,
4888 const char *errors)
4889{
4890 PyObject *v, *unicode;
4891
4892 unicode = PyUnicode_FromUnicode(s, size);
4893 if (unicode == NULL)
4894 return NULL;
4895 v = _PyUnicode_AsUTF8String(unicode, errors);
4896 Py_DECREF(unicode);
4897 return v;
4898}
4899
4900PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004901PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004903 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904}
4905
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906/* --- UTF-32 Codec ------------------------------------------------------- */
4907
4908PyObject *
4909PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004910 Py_ssize_t size,
4911 const char *errors,
4912 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913{
4914 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4915}
4916
4917PyObject *
4918PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 Py_ssize_t size,
4920 const char *errors,
4921 int *byteorder,
4922 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004923{
4924 const char *starts = s;
4925 Py_ssize_t startinpos;
4926 Py_ssize_t endinpos;
4927 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004928 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004929 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930 int bo = 0; /* assume native ordering by default */
4931 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932 /* Offsets from q for retrieving bytes in the right order. */
4933#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4934 int iorder[] = {0, 1, 2, 3};
4935#else
4936 int iorder[] = {3, 2, 1, 0};
4937#endif
4938 PyObject *errorHandler = NULL;
4939 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004940
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 q = (unsigned char *)s;
4942 e = q + size;
4943
4944 if (byteorder)
4945 bo = *byteorder;
4946
4947 /* Check for BOM marks (U+FEFF) in the input and adjust current
4948 byte order setting accordingly. In native mode, the leading BOM
4949 mark is skipped, in all other modes, it is copied to the output
4950 stream as-is (giving a ZWNBSP character). */
4951 if (bo == 0) {
4952 if (size >= 4) {
4953 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 if (bom == 0x0000FEFF) {
4957 q += 4;
4958 bo = -1;
4959 }
4960 else if (bom == 0xFFFE0000) {
4961 q += 4;
4962 bo = 1;
4963 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 if (bom == 0x0000FEFF) {
4966 q += 4;
4967 bo = 1;
4968 }
4969 else if (bom == 0xFFFE0000) {
4970 q += 4;
4971 bo = -1;
4972 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 }
4976
4977 if (bo == -1) {
4978 /* force LE */
4979 iorder[0] = 0;
4980 iorder[1] = 1;
4981 iorder[2] = 2;
4982 iorder[3] = 3;
4983 }
4984 else if (bo == 1) {
4985 /* force BE */
4986 iorder[0] = 3;
4987 iorder[1] = 2;
4988 iorder[2] = 1;
4989 iorder[3] = 0;
4990 }
4991
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004992 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004993 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004994 if (!unicode)
4995 return NULL;
4996 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004997 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004998 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004999
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 Py_UCS4 ch;
5002 /* remaining bytes at the end? (size should be divisible by 4) */
5003 if (e-q<4) {
5004 if (consumed)
5005 break;
5006 errmsg = "truncated data";
5007 startinpos = ((const char *)q)-starts;
5008 endinpos = ((const char *)e)-starts;
5009 goto utf32Error;
5010 /* The remaining input chars are ignored if the callback
5011 chooses to skip the input */
5012 }
5013 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5014 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 if (ch >= 0x110000)
5017 {
5018 errmsg = "codepoint not in range(0x110000)";
5019 startinpos = ((const char *)q)-starts;
5020 endinpos = startinpos+4;
5021 goto utf32Error;
5022 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005023 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5024 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 q += 4;
5026 continue;
5027 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 if (unicode_decode_call_errorhandler(
5029 errors, &errorHandler,
5030 "utf32", errmsg,
5031 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005032 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034 }
5035
5036 if (byteorder)
5037 *byteorder = bo;
5038
5039 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041
5042 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005043 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 goto onError;
5045
5046 Py_XDECREF(errorHandler);
5047 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005048 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 Py_DECREF(unicode);
5052 Py_XDECREF(errorHandler);
5053 Py_XDECREF(exc);
5054 return NULL;
5055}
5056
5057PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005058_PyUnicode_EncodeUTF32(PyObject *str,
5059 const char *errors,
5060 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062 int kind;
5063 void *data;
5064 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005065 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005067 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068 /* Offsets from p for storing byte pairs in the right order. */
5069#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5070 int iorder[] = {0, 1, 2, 3};
5071#else
5072 int iorder[] = {3, 2, 1, 0};
5073#endif
5074
Benjamin Peterson29060642009-01-31 22:14:21 +00005075#define STORECHAR(CH) \
5076 do { \
5077 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5078 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5079 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5080 p[iorder[0]] = (CH) & 0xff; \
5081 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 } while(0)
5083
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005084 if (!PyUnicode_Check(str)) {
5085 PyErr_BadArgument();
5086 return NULL;
5087 }
5088 if (PyUnicode_READY(str) < 0)
5089 return NULL;
5090 kind = PyUnicode_KIND(str);
5091 data = PyUnicode_DATA(str);
5092 len = PyUnicode_GET_LENGTH(str);
5093
5094 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005095 bytesize = nsize * 4;
5096 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005098 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099 if (v == NULL)
5100 return NULL;
5101
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005102 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005106 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107
5108 if (byteorder == -1) {
5109 /* force LE */
5110 iorder[0] = 0;
5111 iorder[1] = 1;
5112 iorder[2] = 2;
5113 iorder[3] = 3;
5114 }
5115 else if (byteorder == 1) {
5116 /* force BE */
5117 iorder[0] = 3;
5118 iorder[1] = 2;
5119 iorder[2] = 1;
5120 iorder[3] = 0;
5121 }
5122
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005123 for (i = 0; i < len; i++)
5124 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005125
5126 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005127 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128#undef STORECHAR
5129}
5130
Alexander Belopolsky40018472011-02-26 01:02:56 +00005131PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5133 Py_ssize_t size,
5134 const char *errors,
5135 int byteorder)
5136{
5137 PyObject *result;
5138 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5139 if (tmp == NULL)
5140 return NULL;
5141 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5142 Py_DECREF(tmp);
5143 return result;
5144}
5145
5146PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005147PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148{
Victor Stinnerb960b342011-11-20 19:12:52 +01005149 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150}
5151
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152/* --- UTF-16 Codec ------------------------------------------------------- */
5153
Tim Peters772747b2001-08-09 22:21:55 +00005154PyObject *
5155PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 Py_ssize_t size,
5157 const char *errors,
5158 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
Walter Dörwald69652032004-09-07 20:24:22 +00005160 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5161}
5162
Antoine Pitrouab868312009-01-10 15:40:25 +00005163/* Two masks for fast checking of whether a C 'long' may contain
5164 UTF16-encoded surrogate characters. This is an efficient heuristic,
5165 assuming that non-surrogate characters with a code point >= 0x8000 are
5166 rare in most input.
5167 FAST_CHAR_MASK is used when the input is in native byte ordering,
5168 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005169*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005170#if (SIZEOF_LONG == 8)
5171# define FAST_CHAR_MASK 0x8000800080008000L
5172# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5173#elif (SIZEOF_LONG == 4)
5174# define FAST_CHAR_MASK 0x80008000L
5175# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5176#else
5177# error C 'long' size should be either 4 or 8!
5178#endif
5179
Walter Dörwald69652032004-09-07 20:24:22 +00005180PyObject *
5181PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 Py_ssize_t size,
5183 const char *errors,
5184 int *byteorder,
5185 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005188 Py_ssize_t startinpos;
5189 Py_ssize_t endinpos;
5190 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005191 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005193 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005194 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005195 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005196 /* Offsets from q for retrieving byte pairs in the right order. */
5197#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5198 int ihi = 1, ilo = 0;
5199#else
5200 int ihi = 0, ilo = 1;
5201#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 PyObject *errorHandler = NULL;
5203 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
5205 /* Note: size will always be longer than the resulting Unicode
5206 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005207 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 if (!unicode)
5209 return NULL;
5210 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005211 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005212 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
Tim Peters772747b2001-08-09 22:21:55 +00005214 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005215 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
5217 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005218 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005220 /* Check for BOM marks (U+FEFF) in the input and adjust current
5221 byte order setting accordingly. In native mode, the leading BOM
5222 mark is skipped, in all other modes, it is copied to the output
5223 stream as-is (giving a ZWNBSP character). */
5224 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005225 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005226 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 if (bom == 0xFEFF) {
5229 q += 2;
5230 bo = -1;
5231 }
5232 else if (bom == 0xFFFE) {
5233 q += 2;
5234 bo = 1;
5235 }
Tim Petersced69f82003-09-16 20:30:58 +00005236#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 if (bom == 0xFEFF) {
5238 q += 2;
5239 bo = 1;
5240 }
5241 else if (bom == 0xFFFE) {
5242 q += 2;
5243 bo = -1;
5244 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005245#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Tim Peters772747b2001-08-09 22:21:55 +00005249 if (bo == -1) {
5250 /* force LE */
5251 ihi = 1;
5252 ilo = 0;
5253 }
5254 else if (bo == 1) {
5255 /* force BE */
5256 ihi = 0;
5257 ilo = 1;
5258 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005259#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5260 native_ordering = ilo < ihi;
5261#else
5262 native_ordering = ilo > ihi;
5263#endif
Tim Peters772747b2001-08-09 22:21:55 +00005264
Antoine Pitrouab868312009-01-10 15:40:25 +00005265 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005266 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005267 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005268 /* First check for possible aligned read of a C 'long'. Unaligned
5269 reads are more expensive, better to defer to another iteration. */
5270 if (!((size_t) q & LONG_PTR_MASK)) {
5271 /* Fast path for runs of non-surrogate chars. */
5272 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005273 int kind = PyUnicode_KIND(unicode);
5274 void *data = PyUnicode_DATA(unicode);
5275 while (_q < aligned_end) {
5276 unsigned long block = * (unsigned long *) _q;
5277 unsigned short *pblock = (unsigned short*)&block;
5278 Py_UCS4 maxch;
5279 if (native_ordering) {
5280 /* Can use buffer directly */
5281 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005282 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005283 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005284 else {
5285 /* Need to byte-swap */
5286 unsigned char *_p = (unsigned char*)pblock;
5287 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005288 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005289 _p[0] = _q[1];
5290 _p[1] = _q[0];
5291 _p[2] = _q[3];
5292 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005293#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005294 _p[4] = _q[5];
5295 _p[5] = _q[4];
5296 _p[6] = _q[7];
5297 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005298#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005299 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005300 maxch = Py_MAX(pblock[0], pblock[1]);
5301#if SIZEOF_LONG == 8
5302 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5303#endif
5304 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5305 if (unicode_widen(&unicode, maxch) < 0)
5306 goto onError;
5307 kind = PyUnicode_KIND(unicode);
5308 data = PyUnicode_DATA(unicode);
5309 }
5310 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5311 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5312#if SIZEOF_LONG == 8
5313 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5314 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5315#endif
5316 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005317 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 q = _q;
5319 if (q >= e)
5320 break;
5321 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005323
Benjamin Peterson14339b62009-01-31 16:36:08 +00005324 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005325
Victor Stinner551ac952011-11-29 22:58:13 +01005326 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005327 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5328 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 continue;
5330 }
5331
5332 /* UTF-16 code pair: */
5333 if (q > e) {
5334 errmsg = "unexpected end of data";
5335 startinpos = (((const char *)q) - 2) - starts;
5336 endinpos = ((const char *)e) + 1 - starts;
5337 goto utf16Error;
5338 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005339 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5340 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005342 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005343 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005344 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005345 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 continue;
5347 }
5348 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005349 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 startinpos = (((const char *)q)-4)-starts;
5351 endinpos = startinpos+2;
5352 goto utf16Error;
5353 }
5354
Benjamin Peterson14339b62009-01-31 16:36:08 +00005355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 errmsg = "illegal encoding";
5357 startinpos = (((const char *)q)-2)-starts;
5358 endinpos = startinpos+2;
5359 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005360
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005363 errors,
5364 &errorHandler,
5365 "utf16", errmsg,
5366 &starts,
5367 (const char **)&e,
5368 &startinpos,
5369 &endinpos,
5370 &exc,
5371 (const char **)&q,
5372 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005373 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 /* remaining byte at the end? (size should be even) */
5377 if (e == q) {
5378 if (!consumed) {
5379 errmsg = "truncated data";
5380 startinpos = ((const char *)q) - starts;
5381 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005382 if (unicode_decode_call_errorhandler(
5383 errors,
5384 &errorHandler,
5385 "utf16", errmsg,
5386 &starts,
5387 (const char **)&e,
5388 &startinpos,
5389 &endinpos,
5390 &exc,
5391 (const char **)&q,
5392 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005393 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 goto onError;
5395 /* The remaining input chars are ignored if the callback
5396 chooses to skip the input */
5397 }
5398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
5400 if (byteorder)
5401 *byteorder = bo;
5402
Walter Dörwald69652032004-09-07 20:24:22 +00005403 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005407 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 goto onError;
5409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 Py_XDECREF(errorHandler);
5411 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005412 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 Py_XDECREF(errorHandler);
5417 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 return NULL;
5419}
5420
Antoine Pitrouab868312009-01-10 15:40:25 +00005421#undef FAST_CHAR_MASK
5422#undef SWAPPED_FAST_CHAR_MASK
5423
Tim Peters772747b2001-08-09 22:21:55 +00005424PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425_PyUnicode_EncodeUTF16(PyObject *str,
5426 const char *errors,
5427 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 int kind;
5430 void *data;
5431 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005432 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005433 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005434 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005435 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005436 /* Offsets from p for storing byte pairs in the right order. */
5437#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5438 int ihi = 1, ilo = 0;
5439#else
5440 int ihi = 0, ilo = 1;
5441#endif
5442
Benjamin Peterson29060642009-01-31 22:14:21 +00005443#define STORECHAR(CH) \
5444 do { \
5445 p[ihi] = ((CH) >> 8) & 0xff; \
5446 p[ilo] = (CH) & 0xff; \
5447 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005448 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 if (!PyUnicode_Check(str)) {
5451 PyErr_BadArgument();
5452 return NULL;
5453 }
5454 if (PyUnicode_READY(str) < 0)
5455 return NULL;
5456 kind = PyUnicode_KIND(str);
5457 data = PyUnicode_DATA(str);
5458 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005459
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005460 pairs = 0;
5461 if (kind == PyUnicode_4BYTE_KIND)
5462 for (i = 0; i < len; i++)
5463 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5464 pairs++;
5465 /* 2 * (len + pairs + (byteorder == 0)) */
5466 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005468 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005469 bytesize = nsize * 2;
5470 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 if (v == NULL)
5474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005480 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005481
5482 if (byteorder == -1) {
5483 /* force LE */
5484 ihi = 1;
5485 ilo = 0;
5486 }
5487 else if (byteorder == 1) {
5488 /* force BE */
5489 ihi = 0;
5490 ilo = 1;
5491 }
5492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005493 for (i = 0; i < len; i++) {
5494 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5495 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005497 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5498 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 }
Tim Peters772747b2001-08-09 22:21:55 +00005500 STORECHAR(ch);
5501 if (ch2)
5502 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005503 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005504
5505 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005506 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005507#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508}
5509
Alexander Belopolsky40018472011-02-26 01:02:56 +00005510PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005511PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5512 Py_ssize_t size,
5513 const char *errors,
5514 int byteorder)
5515{
5516 PyObject *result;
5517 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5518 if (tmp == NULL)
5519 return NULL;
5520 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5521 Py_DECREF(tmp);
5522 return result;
5523}
5524
5525PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005526PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005528 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529}
5530
5531/* --- Unicode Escape Codec ----------------------------------------------- */
5532
/* Helper function for PyUnicode_DecodeUnicodeEscape: determine whether the
   escaped input decodes to a pure ASCII string and, if so, how long that
   string is.

   Returns the number of characters the decoded string needs, counting
     - 1 for every plain ASCII byte,
     - 0 for a backslash followed by a newline,
     - 1 for the single-character escapes \\ \' \" \b \f \t \n \r \v \a,
     - 2 for a backslash followed by any other ASCII byte (both the
       backslash and that byte are counted).

   Returns -1 when the length cannot be determined this way:
     - size is negative,
     - the input contains a byte above 127,
     - the input ends right after a backslash,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee an ASCII result).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it in pointer arithmetic: forming
       s + size for a negative size is not a valid pointer computation. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5587
Fredrik Lundh06d12682001-01-24 07:59:11 +00005588static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005589
Alexander Belopolsky40018472011-02-26 01:02:56 +00005590PyObject *
5591PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005592 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005593 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005596 Py_ssize_t startinpos;
5597 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005599 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 char* message;
5602 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 PyObject *errorHandler = NULL;
5604 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005605 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005606 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005608 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609
5610 /* After length_of_escaped_ascii_string() there are two alternatives,
5611 either the string is pure ASCII with named escapes like \n, etc.
5612 and we determined it's exact size (common case)
5613 or it contains \x, \u, ... escape sequences. then we create a
5614 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615 if (len >= 0) {
5616 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617 if (!v)
5618 goto onError;
5619 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005620 }
5621 else {
5622 /* Escaped strings will always be longer than the resulting
5623 Unicode string, so we start with size here and then reduce the
5624 length after conversion to the true value.
5625 (but if the error callback returns a long replacement string
5626 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005627 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 if (!v)
5629 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005630 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 }
5632
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005634 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005637
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 while (s < end) {
5639 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005640 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 /* The only case in which i == ascii_length is a backslash
5644 followed by a newline. */
5645 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 /* Non-escape characters are interpreted as Unicode ordinals */
5648 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5650 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 continue;
5652 }
5653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* \ - Escapes */
5656 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005657 c = *s++;
5658 if (s > end)
5659 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005661 /* The only case in which i == ascii_length is a backslash
5662 followed by a newline. */
5663 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005664
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005665 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668#define WRITECHAR(ch) \
5669 do { \
5670 if (unicode_putchar(&v, &i, ch) < 0) \
5671 goto onError; \
5672 }while(0)
5673
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 case '\\': WRITECHAR('\\'); break;
5676 case '\'': WRITECHAR('\''); break;
5677 case '\"': WRITECHAR('\"'); break;
5678 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 case 'f': WRITECHAR('\014'); break;
5681 case 't': WRITECHAR('\t'); break;
5682 case 'n': WRITECHAR('\n'); break;
5683 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 case '0': case '1': case '2': case '3':
5691 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005692 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005693 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005694 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005695 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005696 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 break;
5700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 /* hex escapes */
5702 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005704 digits = 2;
5705 message = "truncated \\xXX escape";
5706 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005710 digits = 4;
5711 message = "truncated \\uXXXX escape";
5712 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005715 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005716 digits = 8;
5717 message = "truncated \\UXXXXXXXX escape";
5718 hexescape:
5719 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 if (s+digits>end) {
5721 endinpos = size;
5722 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 errors, &errorHandler,
5724 "unicodeescape", "end of string in escape sequence",
5725 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005726 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 goto onError;
5728 goto nextByte;
5729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 for (j = 0; j < digits; ++j) {
5731 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005732 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005733 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 errors, &errorHandler,
5736 "unicodeescape", message,
5737 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005739 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005740 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005742 }
5743 chr = (chr<<4) & ~0xF;
5744 if (c >= '0' && c <= '9')
5745 chr += c - '0';
5746 else if (c >= 'a' && c <= 'f')
5747 chr += 10 + c - 'a';
5748 else
5749 chr += 10 + c - 'A';
5750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005752 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 /* _decoding_error will have already written into the
5754 target buffer. */
5755 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005756 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005758 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005760 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 errors, &errorHandler,
5764 "unicodeescape", "illegal Unicode character",
5765 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 goto onError;
5768 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005769 break;
5770
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005772 case 'N':
5773 message = "malformed \\N character escape";
5774 if (ucnhash_CAPI == NULL) {
5775 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5777 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005778 if (ucnhash_CAPI == NULL)
5779 goto ucnhashError;
5780 }
5781 if (*s == '{') {
5782 const char *start = s+1;
5783 /* look for the closing brace */
5784 while (*s != '}' && s < end)
5785 s++;
5786 if (s > start && s < end && *s == '}') {
5787 /* found a name. look it up in the unicode database */
5788 message = "unknown Unicode character name";
5789 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005790 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005791 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005792 goto store;
5793 }
5794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 errors, &errorHandler,
5798 "unicodeescape", message,
5799 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005800 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005802 break;
5803
5804 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005805 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 message = "\\ at end of string";
5807 s--;
5808 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 errors, &errorHandler,
5811 "unicodeescape", message,
5812 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005813 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 goto onError;
5815 }
5816 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005817 WRITECHAR('\\');
5818 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005819 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005820 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 if (PyUnicode_Resize(&v, i) < 0)
5828 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005829 Py_XDECREF(errorHandler);
5830 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005831 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005832
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005834 PyErr_SetString(
5835 PyExc_UnicodeError,
5836 "\\N escapes not supported (can't load unicodedata module)"
5837 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005838 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005841 return NULL;
5842
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 return NULL;
5848}
5849
5850/* Return a Unicode-Escape string version of the Unicode object.
5851
5852 If quotes is true, the string is enclosed in u"" or u'' quotes as
5853 appropriate.
5854
5855*/
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005858PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005860 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005861 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005863 int kind;
5864 void *data;
5865 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
Thomas Wouters89f507f2006-12-13 04:49:30 +00005867 /* Initial allocation is based on the longest-possible unichr
5868 escape.
5869
5870 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5871 unichr, so in this case it's the longest unichr escape. In
5872 narrow (UTF-16) builds this is five chars per source unichr
5873 since there are two unichrs in the surrogate pair, so in narrow
5874 (UTF-16) builds it's not the longest unichr escape.
5875
5876 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5877 so in the narrow (UTF-16) build case it's the longest unichr
5878 escape.
5879 */
5880
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 if (!PyUnicode_Check(unicode)) {
5882 PyErr_BadArgument();
5883 return NULL;
5884 }
5885 if (PyUnicode_READY(unicode) < 0)
5886 return NULL;
5887 len = PyUnicode_GET_LENGTH(unicode);
5888 kind = PyUnicode_KIND(unicode);
5889 data = PyUnicode_DATA(unicode);
5890 switch(kind) {
5891 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5892 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5893 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5894 }
5895
5896 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005897 return PyBytes_FromStringAndSize(NULL, 0);
5898
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005899 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005901
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (repr == NULL)
5907 return NULL;
5908
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005912 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005913
Walter Dörwald79e913e2007-05-12 11:08:06 +00005914 /* Escape backslashes */
5915 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 *p++ = '\\';
5917 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005918 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005919 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005920
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005921 /* Map 21-bit characters to '\U00xxxxxx' */
5922 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005923 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005924 *p++ = '\\';
5925 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005926 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5927 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5928 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5929 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5930 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5931 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5932 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5933 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005935 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005938 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 *p++ = '\\';
5940 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005941 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5942 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5943 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5944 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005946
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005947 /* Map special whitespace to '\t', \n', '\r' */
5948 else if (ch == '\t') {
5949 *p++ = '\\';
5950 *p++ = 't';
5951 }
5952 else if (ch == '\n') {
5953 *p++ = '\\';
5954 *p++ = 'n';
5955 }
5956 else if (ch == '\r') {
5957 *p++ = '\\';
5958 *p++ = 'r';
5959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005961 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005962 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005964 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005965 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5966 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 /* Copy everything else as-is */
5970 else
5971 *p++ = (char) ch;
5972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005974 assert(p - PyBytes_AS_STRING(repr) > 0);
5975 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5976 return NULL;
5977 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978}
5979
Alexander Belopolsky40018472011-02-26 01:02:56 +00005980PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5982 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005984 PyObject *result;
5985 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5986 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988 result = PyUnicode_AsUnicodeEscapeString(tmp);
5989 Py_DECREF(tmp);
5990 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991}
5992
5993/* --- Raw Unicode Escape Codec ------------------------------------------- */
5994
Alexander Belopolsky40018472011-02-26 01:02:56 +00005995PyObject *
5996PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005997 Py_ssize_t size,
5998 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 Py_ssize_t startinpos;
6002 Py_ssize_t endinpos;
6003 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006004 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 const char *end;
6006 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 PyObject *errorHandler = NULL;
6008 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006009
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 /* Escaped strings will always be longer than the resulting
6011 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 length after conversion to the true value. (But decoding error
6013 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006014 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006018 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006019 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 end = s + size;
6021 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 unsigned char c;
6023 Py_UCS4 x;
6024 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006025 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 /* Non-escape characters are interpreted as Unicode ordinals */
6028 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006029 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6030 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006032 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 startinpos = s-starts;
6034
6035 /* \u-escapes are only interpreted iff the number of leading
6036 backslashes if odd */
6037 bs = s;
6038 for (;s < end;) {
6039 if (*s != '\\')
6040 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006041 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6042 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 }
6044 if (((s - bs) & 1) == 0 ||
6045 s >= end ||
6046 (*s != 'u' && *s != 'U')) {
6047 continue;
6048 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006049 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 count = *s=='u' ? 4 : 8;
6051 s++;
6052
6053 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 for (x = 0, i = 0; i < count; ++i, ++s) {
6055 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006056 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 endinpos = s-starts;
6058 if (unicode_decode_call_errorhandler(
6059 errors, &errorHandler,
6060 "rawunicodeescape", "truncated \\uXXXX",
6061 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006062 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 goto onError;
6064 goto nextByte;
6065 }
6066 x = (x<<4) & ~0xF;
6067 if (c >= '0' && c <= '9')
6068 x += c - '0';
6069 else if (c >= 'a' && c <= 'f')
6070 x += 10 + c - 'a';
6071 else
6072 x += 10 + c - 'A';
6073 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006074 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006075 if (unicode_putchar(&v, &outpos, x) < 0)
6076 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006077 } else {
6078 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006079 if (unicode_decode_call_errorhandler(
6080 errors, &errorHandler,
6081 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 nextByte:
6087 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006089 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 Py_XDECREF(errorHandler);
6092 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006093 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006094
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 Py_XDECREF(errorHandler);
6098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return NULL;
6100}
6101
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102
Alexander Belopolsky40018472011-02-26 01:02:56 +00006103PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006104PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006106 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 char *p;
6108 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006109 Py_ssize_t expandsize, pos;
6110 int kind;
6111 void *data;
6112 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114 if (!PyUnicode_Check(unicode)) {
6115 PyErr_BadArgument();
6116 return NULL;
6117 }
6118 if (PyUnicode_READY(unicode) < 0)
6119 return NULL;
6120 kind = PyUnicode_KIND(unicode);
6121 data = PyUnicode_DATA(unicode);
6122 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006123 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6124 bytes, and 1 byte characters 4. */
6125 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006129
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006130 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 if (repr == NULL)
6132 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006134 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006136 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 for (pos = 0; pos < len; pos++) {
6138 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* Map 32-bit characters to '\Uxxxxxxxx' */
6140 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006141 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006142 *p++ = '\\';
6143 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006144 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6145 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6146 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6147 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6148 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6149 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6150 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6151 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006152 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 *p++ = '\\';
6156 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006157 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6160 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 /* Copy everything else as-is */
6163 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 *p++ = (char) ch;
6165 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 assert(p > q);
6168 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006169 return NULL;
6170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171}
6172
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 PyObject *result;
6178 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6179 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006180 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6182 Py_DECREF(tmp);
6183 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006186/* --- Unicode Internal Codec ------------------------------------------- */
6187
Alexander Belopolsky40018472011-02-26 01:02:56 +00006188PyObject *
6189_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006190 Py_ssize_t size,
6191 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006192{
6193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194 Py_ssize_t startinpos;
6195 Py_ssize_t endinpos;
6196 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006197 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006198 const char *end;
6199 const char *reason;
6200 PyObject *errorHandler = NULL;
6201 PyObject *exc = NULL;
6202
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006203 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006204 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006205 1))
6206 return NULL;
6207
Thomas Wouters89f507f2006-12-13 04:49:30 +00006208 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006209 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006212 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006213 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006214 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006215 end = s + size;
6216
6217 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006218 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006219 Py_UCS4 ch;
6220 /* We copy the raw representation one byte at a time because the
6221 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006222 ((char *) &uch)[0] = s[0];
6223 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006224#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006225 ((char *) &uch)[2] = s[2];
6226 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006227#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006228 ch = uch;
6229
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006230 /* We have to sanity check the raw data, otherwise doom looms for
6231 some malformed UCS-4 data. */
6232 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006233#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006234 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006235#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236 end-s < Py_UNICODE_SIZE
6237 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006239 startinpos = s - starts;
6240 if (end-s < Py_UNICODE_SIZE) {
6241 endinpos = end-starts;
6242 reason = "truncated input";
6243 }
6244 else {
6245 endinpos = s - starts + Py_UNICODE_SIZE;
6246 reason = "illegal code point (> 0x10FFFF)";
6247 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006248 if (unicode_decode_call_errorhandler(
6249 errors, &errorHandler,
6250 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006251 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 continue;
6255 }
6256
6257 s += Py_UNICODE_SIZE;
6258#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006259 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006260 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006261 Py_UNICODE uch2;
6262 ((char *) &uch2)[0] = s[0];
6263 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006264 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006265 {
Victor Stinner551ac952011-11-29 22:58:13 +01006266 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006267 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 }
6269 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006270#endif
6271
6272 if (unicode_putchar(&v, &outpos, ch) < 0)
6273 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006274 }
6275
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006276 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 goto onError;
6278 Py_XDECREF(errorHandler);
6279 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006280 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 Py_XDECREF(v);
6284 Py_XDECREF(errorHandler);
6285 Py_XDECREF(exc);
6286 return NULL;
6287}
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289/* --- Latin-1 Codec ------------------------------------------------------ */
6290
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291PyObject *
6292PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006293 Py_ssize_t size,
6294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006297 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298}
6299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006301static void
6302make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006303 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006304 PyObject *unicode,
6305 Py_ssize_t startpos, Py_ssize_t endpos,
6306 const char *reason)
6307{
6308 if (*exceptionObject == NULL) {
6309 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006310 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006311 encoding, unicode, startpos, endpos, reason);
6312 }
6313 else {
6314 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6315 goto onError;
6316 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6317 goto onError;
6318 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6319 goto onError;
6320 return;
6321 onError:
6322 Py_DECREF(*exceptionObject);
6323 *exceptionObject = NULL;
6324 }
6325}
6326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006328static void
6329raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006330 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006331 PyObject *unicode,
6332 Py_ssize_t startpos, Py_ssize_t endpos,
6333 const char *reason)
6334{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006335 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006336 encoding, unicode, startpos, endpos, reason);
6337 if (*exceptionObject != NULL)
6338 PyCodec_StrictErrors(*exceptionObject);
6339}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340
6341/* error handling callback helper:
6342 build arguments, call the callback and check the arguments,
6343 put the result into newpos and return the replacement string, which
6344 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006345static PyObject *
6346unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006347 PyObject **errorHandler,
6348 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006349 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006353 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 PyObject *restuple;
6356 PyObject *resunicode;
6357
6358 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 }
6363
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 if (PyUnicode_READY(unicode) < 0)
6365 return NULL;
6366 len = PyUnicode_GET_LENGTH(unicode);
6367
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006368 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372
6373 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006378 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 Py_DECREF(restuple);
6380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006382 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 &resunicode, newpos)) {
6384 Py_DECREF(restuple);
6385 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006387 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6388 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6389 Py_DECREF(restuple);
6390 return NULL;
6391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 *newpos = len + *newpos;
6394 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6396 Py_DECREF(restuple);
6397 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_INCREF(resunicode);
6400 Py_DECREF(restuple);
6401 return resunicode;
6402}
6403
Alexander Belopolsky40018472011-02-26 01:02:56 +00006404static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006406 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006407 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 /* input state */
6410 Py_ssize_t pos=0, size;
6411 int kind;
6412 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 /* output object */
6414 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 /* pointer into the output */
6416 char *str;
6417 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006418 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006419 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6420 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 PyObject *errorHandler = NULL;
6422 PyObject *exc = NULL;
6423 /* the following variable is used for caching string comparisons
6424 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6425 int known_errorHandler = -1;
6426
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006427 if (PyUnicode_READY(unicode) < 0)
6428 return NULL;
6429 size = PyUnicode_GET_LENGTH(unicode);
6430 kind = PyUnicode_KIND(unicode);
6431 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 /* allocate enough for a simple encoding without
6433 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006434 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006435 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006436 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006438 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006439 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 ressize = size;
6441
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 while (pos < size) {
6443 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 /* can we encode this? */
6446 if (c<limit) {
6447 /* no overflow check, because we know that the space is enough */
6448 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006450 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 Py_ssize_t requiredsize;
6453 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006454 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456 Py_ssize_t collstart = pos;
6457 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006459 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 ++collend;
6461 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6462 if (known_errorHandler==-1) {
6463 if ((errors==NULL) || (!strcmp(errors, "strict")))
6464 known_errorHandler = 1;
6465 else if (!strcmp(errors, "replace"))
6466 known_errorHandler = 2;
6467 else if (!strcmp(errors, "ignore"))
6468 known_errorHandler = 3;
6469 else if (!strcmp(errors, "xmlcharrefreplace"))
6470 known_errorHandler = 4;
6471 else
6472 known_errorHandler = 0;
6473 }
6474 switch (known_errorHandler) {
6475 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006476 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 goto onError;
6478 case 2: /* replace */
6479 while (collstart++<collend)
6480 *str++ = '?'; /* fall through */
6481 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006482 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 break;
6484 case 4: /* xmlcharrefreplace */
6485 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 /* determine replacement size */
6487 for (i = collstart, repsize = 0; i < collend; ++i) {
6488 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6489 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006501 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006502 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006504 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 if (requiredsize > ressize) {
6508 if (requiredsize<2*ressize)
6509 requiredsize = 2*ressize;
6510 if (_PyBytes_Resize(&res, requiredsize))
6511 goto onError;
6512 str = PyBytes_AS_STRING(res) + respos;
6513 ressize = requiredsize;
6514 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 /* generate replacement */
6516 for (i = collstart; i < collend; ++i) {
6517 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006519 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 break;
6521 default:
6522 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006523 encoding, reason, unicode, &exc,
6524 collstart, collend, &newpos);
6525 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6526 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006528 if (PyBytes_Check(repunicode)) {
6529 /* Directly copy bytes result to output. */
6530 repsize = PyBytes_Size(repunicode);
6531 if (repsize > 1) {
6532 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006533 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006534 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6535 Py_DECREF(repunicode);
6536 goto onError;
6537 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006538 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006539 ressize += repsize-1;
6540 }
6541 memcpy(str, PyBytes_AsString(repunicode), repsize);
6542 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006544 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006545 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 /* need more space? (at least enough for what we
6548 have+the replacement+the rest of the string, so
6549 we won't have to check space for encodable characters) */
6550 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 repsize = PyUnicode_GET_LENGTH(repunicode);
6552 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 if (requiredsize > ressize) {
6554 if (requiredsize<2*ressize)
6555 requiredsize = 2*ressize;
6556 if (_PyBytes_Resize(&res, requiredsize)) {
6557 Py_DECREF(repunicode);
6558 goto onError;
6559 }
6560 str = PyBytes_AS_STRING(res) + respos;
6561 ressize = requiredsize;
6562 }
6563 /* check if there is anything unencodable in the replacement
6564 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006565 for (i = 0; repsize-->0; ++i, ++str) {
6566 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006568 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 Py_DECREF(repunicode);
6571 goto onError;
6572 }
6573 *str = (char)c;
6574 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006576 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006578 }
6579 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006580 /* Resize if we allocated to much */
6581 size = str - PyBytes_AS_STRING(res);
6582 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006583 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006584 if (_PyBytes_Resize(&res, size) < 0)
6585 goto onError;
6586 }
6587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 Py_XDECREF(errorHandler);
6589 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006590 return res;
6591
6592 onError:
6593 Py_XDECREF(res);
6594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
6596 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597}
6598
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006599/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006600PyObject *
6601PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006602 Py_ssize_t size,
6603 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 PyObject *result;
6606 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6607 if (unicode == NULL)
6608 return NULL;
6609 result = unicode_encode_ucs1(unicode, errors, 256);
6610 Py_DECREF(unicode);
6611 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612}
6613
Alexander Belopolsky40018472011-02-26 01:02:56 +00006614PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006615_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
6617 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 PyErr_BadArgument();
6619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006621 if (PyUnicode_READY(unicode) == -1)
6622 return NULL;
6623 /* Fast path: if it is a one-byte string, construct
6624 bytes object directly. */
6625 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6626 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6627 PyUnicode_GET_LENGTH(unicode));
6628 /* Non-Latin-1 characters present. Defer to above function to
6629 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006631}
6632
6633PyObject*
6634PyUnicode_AsLatin1String(PyObject *unicode)
6635{
6636 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
6639/* --- 7-bit ASCII Codec -------------------------------------------------- */
6640
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641PyObject *
6642PyUnicode_DecodeASCII(const char *s,
6643 Py_ssize_t size,
6644 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006647 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006648 int kind;
6649 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006650 Py_ssize_t startinpos;
6651 Py_ssize_t endinpos;
6652 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006654 int has_error;
6655 const unsigned char *p = (const unsigned char *)s;
6656 const unsigned char *end = p + size;
6657 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 PyObject *errorHandler = NULL;
6659 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006660
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006661 if (size == 0) {
6662 Py_INCREF(unicode_empty);
6663 return unicode_empty;
6664 }
6665
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006667 if (size == 1 && (unsigned char)s[0] < 128)
6668 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669
Victor Stinner702c7342011-10-05 13:50:52 +02006670 has_error = 0;
6671 while (p < end && !has_error) {
6672 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6673 an explanation. */
6674 if (!((size_t) p & LONG_PTR_MASK)) {
6675 /* Help register allocation */
6676 register const unsigned char *_p = p;
6677 while (_p < aligned_end) {
6678 unsigned long value = *(unsigned long *) _p;
6679 if (value & ASCII_CHAR_MASK) {
6680 has_error = 1;
6681 break;
6682 }
6683 _p += SIZEOF_LONG;
6684 }
6685 if (_p == end)
6686 break;
6687 if (has_error)
6688 break;
6689 p = _p;
6690 }
6691 if (*p & 0x80) {
6692 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006693 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006694 }
6695 else {
6696 ++p;
6697 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006698 }
Victor Stinner702c7342011-10-05 13:50:52 +02006699 if (!has_error)
6700 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006701
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006702 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006706 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006707 kind = PyUnicode_KIND(v);
6708 data = PyUnicode_DATA(v);
6709 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 e = s + size;
6711 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 register unsigned char c = (unsigned char)*s;
6713 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006714 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 ++s;
6716 }
6717 else {
6718 startinpos = s-starts;
6719 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 if (unicode_decode_call_errorhandler(
6721 errors, &errorHandler,
6722 "ascii", "ordinal not in range(128)",
6723 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006724 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006726 kind = PyUnicode_KIND(v);
6727 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006730 if (PyUnicode_Resize(&v, outpos) < 0)
6731 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 Py_XDECREF(errorHandler);
6733 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006734 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006735 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006736
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 Py_XDECREF(errorHandler);
6740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 return NULL;
6742}
6743
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006744/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006745PyObject *
6746PyUnicode_EncodeASCII(const Py_UNICODE *p,
6747 Py_ssize_t size,
6748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750 PyObject *result;
6751 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6752 if (unicode == NULL)
6753 return NULL;
6754 result = unicode_encode_ucs1(unicode, errors, 128);
6755 Py_DECREF(unicode);
6756 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006760_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761{
6762 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 PyErr_BadArgument();
6764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766 if (PyUnicode_READY(unicode) == -1)
6767 return NULL;
6768 /* Fast path: if it is an ASCII-only string, construct bytes object
6769 directly. Else defer to above function to raise the exception. */
6770 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6771 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6772 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006774}
6775
6776PyObject *
6777PyUnicode_AsASCIIString(PyObject *unicode)
6778{
6779 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780}
6781
Victor Stinner99b95382011-07-04 14:23:54 +02006782#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006783
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006784/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006785
/* When size_t is wider than int, define NEED_RETRY so that inputs longer
   than INT_MAX are processed in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6793
6794static char*
6795code_page_name(UINT code_page, PyObject **obj)
6796{
6797 *obj = NULL;
6798 if (code_page == CP_ACP)
6799 return "mbcs";
6800 if (code_page == CP_UTF7)
6801 return "CP_UTF7";
6802 if (code_page == CP_UTF8)
6803 return "CP_UTF8";
6804
6805 *obj = PyBytes_FromFormat("cp%u", code_page);
6806 if (*obj == NULL)
6807 return NULL;
6808 return PyBytes_AS_STRING(*obj);
6809}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810
Alexander Belopolsky40018472011-02-26 01:02:56 +00006811static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006812is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813{
6814 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816
Victor Stinner3a50e702011-10-18 21:21:00 +02006817 if (!IsDBCSLeadByteEx(code_page, *curr))
6818 return 0;
6819
6820 prev = CharPrevExA(code_page, s, curr, 0);
6821 if (prev == curr)
6822 return 1;
6823 /* FIXME: This code is limited to "true" double-byte encodings,
6824 as it assumes an incomplete character consists of a single
6825 byte. */
6826 if (curr - prev == 2)
6827 return 1;
6828 if (!IsDBCSLeadByteEx(code_page, *prev))
6829 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006830 return 0;
6831}
6832
Victor Stinner3a50e702011-10-18 21:21:00 +02006833static DWORD
6834decode_code_page_flags(UINT code_page)
6835{
6836 if (code_page == CP_UTF7) {
6837 /* The CP_UTF7 decoder only supports flags=0 */
6838 return 0;
6839 }
6840 else
6841 return MB_ERR_INVALID_CHARS;
6842}
6843
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006845 * Decode a byte string from a Windows code page into unicode object in strict
6846 * mode.
6847 *
6848 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6849 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006852decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006853 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 const char *in,
6855 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856{
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006858 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860
6861 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006862 assert(insize > 0);
6863 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6864 if (outsize <= 0)
6865 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006866
6867 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006869 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 if (*v == NULL)
6871 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 }
6874 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006877 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006879 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 }
6881
6882 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6884 if (outsize <= 0)
6885 goto error;
6886 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006887
Victor Stinner3a50e702011-10-18 21:21:00 +02006888error:
6889 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6890 return -2;
6891 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006892 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893}
6894
Victor Stinner3a50e702011-10-18 21:21:00 +02006895/*
6896 * Decode a byte string from a code page into unicode object with an error
6897 * handler.
6898 *
6899 * Returns consumed size if succeed, or raise a WindowsError or
6900 * UnicodeDecodeError exception and returns -1 on error.
6901 */
6902static int
6903decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006904 PyObject **v,
6905 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 const char *errors)
6907{
6908 const char *startin = in;
6909 const char *endin = in + size;
6910 const DWORD flags = decode_code_page_flags(code_page);
6911 /* Ideally, we should get reason from FormatMessage. This is the Windows
6912 2000 English version of the message. */
6913 const char *reason = "No mapping for the Unicode character exists "
6914 "in the target code page.";
6915 /* each step cannot decode more than 1 character, but a character can be
6916 represented as a surrogate pair */
6917 wchar_t buffer[2], *startout, *out;
6918 int insize, outsize;
6919 PyObject *errorHandler = NULL;
6920 PyObject *exc = NULL;
6921 PyObject *encoding_obj = NULL;
6922 char *encoding;
6923 DWORD err;
6924 int ret = -1;
6925
6926 assert(size > 0);
6927
6928 encoding = code_page_name(code_page, &encoding_obj);
6929 if (encoding == NULL)
6930 return -1;
6931
6932 if (errors == NULL || strcmp(errors, "strict") == 0) {
6933 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6934 UnicodeDecodeError. */
6935 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6936 if (exc != NULL) {
6937 PyCodec_StrictErrors(exc);
6938 Py_CLEAR(exc);
6939 }
6940 goto error;
6941 }
6942
6943 if (*v == NULL) {
6944 /* Create unicode object */
6945 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6946 PyErr_NoMemory();
6947 goto error;
6948 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006949 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006950 if (*v == NULL)
6951 goto error;
6952 startout = PyUnicode_AS_UNICODE(*v);
6953 }
6954 else {
6955 /* Extend unicode object */
6956 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6957 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6958 PyErr_NoMemory();
6959 goto error;
6960 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006961 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 goto error;
6963 startout = PyUnicode_AS_UNICODE(*v) + n;
6964 }
6965
6966 /* Decode the byte string character per character */
6967 out = startout;
6968 while (in < endin)
6969 {
6970 /* Decode a character */
6971 insize = 1;
6972 do
6973 {
6974 outsize = MultiByteToWideChar(code_page, flags,
6975 in, insize,
6976 buffer, Py_ARRAY_LENGTH(buffer));
6977 if (outsize > 0)
6978 break;
6979 err = GetLastError();
6980 if (err != ERROR_NO_UNICODE_TRANSLATION
6981 && err != ERROR_INSUFFICIENT_BUFFER)
6982 {
6983 PyErr_SetFromWindowsErr(0);
6984 goto error;
6985 }
6986 insize++;
6987 }
6988 /* 4=maximum length of a UTF-8 sequence */
6989 while (insize <= 4 && (in + insize) <= endin);
6990
6991 if (outsize <= 0) {
6992 Py_ssize_t startinpos, endinpos, outpos;
6993
6994 startinpos = in - startin;
6995 endinpos = startinpos + 1;
6996 outpos = out - PyUnicode_AS_UNICODE(*v);
6997 if (unicode_decode_call_errorhandler(
6998 errors, &errorHandler,
6999 encoding, reason,
7000 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007001 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 {
7003 goto error;
7004 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007005 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007006 }
7007 else {
7008 in += insize;
7009 memcpy(out, buffer, outsize * sizeof(wchar_t));
7010 out += outsize;
7011 }
7012 }
7013
7014 /* write a NUL character at the end */
7015 *out = 0;
7016
7017 /* Extend unicode object */
7018 outsize = out - startout;
7019 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007020 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007022 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007023
7024error:
7025 Py_XDECREF(encoding_obj);
7026 Py_XDECREF(errorHandler);
7027 Py_XDECREF(exc);
7028 return ret;
7029}
7030
Victor Stinner3a50e702011-10-18 21:21:00 +02007031static PyObject *
7032decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 const char *s, Py_ssize_t size,
7034 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035{
Victor Stinner76a31a62011-11-04 00:05:13 +01007036 PyObject *v = NULL;
7037 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038
Victor Stinner3a50e702011-10-18 21:21:00 +02007039 if (code_page < 0) {
7040 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7041 return NULL;
7042 }
7043
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046
Victor Stinner76a31a62011-11-04 00:05:13 +01007047 do
7048 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007050 if (size > INT_MAX) {
7051 chunk_size = INT_MAX;
7052 final = 0;
7053 done = 0;
7054 }
7055 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 {
7058 chunk_size = (int)size;
7059 final = (consumed == NULL);
7060 done = 1;
7061 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 /* Skip trailing lead-byte unless 'final' is set */
7064 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7065 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066
Victor Stinner76a31a62011-11-04 00:05:13 +01007067 if (chunk_size == 0 && done) {
7068 if (v != NULL)
7069 break;
7070 Py_INCREF(unicode_empty);
7071 return unicode_empty;
7072 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
Victor Stinner76a31a62011-11-04 00:05:13 +01007074
7075 converted = decode_code_page_strict(code_page, &v,
7076 s, chunk_size);
7077 if (converted == -2)
7078 converted = decode_code_page_errors(code_page, &v,
7079 s, chunk_size,
7080 errors);
7081 assert(converted != 0);
7082
7083 if (converted < 0) {
7084 Py_XDECREF(v);
7085 return NULL;
7086 }
7087
7088 if (consumed)
7089 *consumed += converted;
7090
7091 s += converted;
7092 size -= converted;
7093 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007094
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007095 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096}
7097
Alexander Belopolsky40018472011-02-26 01:02:56 +00007098PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007099PyUnicode_DecodeCodePageStateful(int code_page,
7100 const char *s,
7101 Py_ssize_t size,
7102 const char *errors,
7103 Py_ssize_t *consumed)
7104{
7105 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7106}
7107
7108PyObject *
7109PyUnicode_DecodeMBCSStateful(const char *s,
7110 Py_ssize_t size,
7111 const char *errors,
7112 Py_ssize_t *consumed)
7113{
7114 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7115}
7116
7117PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118PyUnicode_DecodeMBCS(const char *s,
7119 Py_ssize_t size,
7120 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007121{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7123}
7124
Victor Stinner3a50e702011-10-18 21:21:00 +02007125static DWORD
7126encode_code_page_flags(UINT code_page, const char *errors)
7127{
7128 if (code_page == CP_UTF8) {
7129 if (winver.dwMajorVersion >= 6)
7130 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7131 and later */
7132 return WC_ERR_INVALID_CHARS;
7133 else
7134 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7135 return 0;
7136 }
7137 else if (code_page == CP_UTF7) {
7138 /* CP_UTF7 only supports flags=0 */
7139 return 0;
7140 }
7141 else {
7142 if (errors != NULL && strcmp(errors, "replace") == 0)
7143 return 0;
7144 else
7145 return WC_NO_BEST_FIT_CHARS;
7146 }
7147}
7148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 * Encode a Unicode string to a Windows code page into a byte string in strict
7151 * mode.
7152 *
7153 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7154 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007157encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007158 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160{
Victor Stinner554f3f02010-06-16 23:33:54 +00007161 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 BOOL *pusedDefaultChar = &usedDefaultChar;
7163 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007164 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007165 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007166 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 const DWORD flags = encode_code_page_flags(code_page, NULL);
7168 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 /* Create a substring so that we can get the UTF-16 representation
7170 of just the slice under consideration. */
7171 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172
Martin v. Löwis3d325192011-11-04 18:23:06 +01007173 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007174
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007176 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007178 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007179
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 substring = PyUnicode_Substring(unicode, offset, offset+len);
7181 if (substring == NULL)
7182 return -1;
7183 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7184 if (p == NULL) {
7185 Py_DECREF(substring);
7186 return -1;
7187 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007188
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007189 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 outsize = WideCharToMultiByte(code_page, flags,
7191 p, size,
7192 NULL, 0,
7193 NULL, pusedDefaultChar);
7194 if (outsize <= 0)
7195 goto error;
7196 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 if (pusedDefaultChar && *pusedDefaultChar) {
7198 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007200 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007201
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 if (*outbytes == NULL) {
7206 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210 }
7211 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 const Py_ssize_t n = PyBytes_Size(*outbytes);
7214 if (outsize > PY_SSIZE_T_MAX - n) {
7215 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007216 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7220 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007224 }
7225
7226 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 outsize = WideCharToMultiByte(code_page, flags,
7228 p, size,
7229 out, outsize,
7230 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (outsize <= 0)
7233 goto error;
7234 if (pusedDefaultChar && *pusedDefaultChar)
7235 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007236 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007237
Victor Stinner3a50e702011-10-18 21:21:00 +02007238error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007239 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7241 return -2;
7242 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007243 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007244}
7245
Victor Stinner3a50e702011-10-18 21:21:00 +02007246/*
7247 * Encode a Unicode string to a Windows code page into a byte string using a
7248 * error handler.
7249 *
7250 * Returns consumed characters if succeed, or raise a WindowsError and returns
7251 * -1 on other error.
7252 */
7253static int
7254encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007255 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007256 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007257{
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 Py_ssize_t pos = unicode_offset;
7260 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 /* Ideally, we should get reason from FormatMessage. This is the Windows
7262 2000 English version of the message. */
7263 const char *reason = "invalid character";
7264 /* 4=maximum length of a UTF-8 sequence */
7265 char buffer[4];
7266 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7267 Py_ssize_t outsize;
7268 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 PyObject *errorHandler = NULL;
7270 PyObject *exc = NULL;
7271 PyObject *encoding_obj = NULL;
7272 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007273 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 PyObject *rep;
7275 int ret = -1;
7276
7277 assert(insize > 0);
7278
7279 encoding = code_page_name(code_page, &encoding_obj);
7280 if (encoding == NULL)
7281 return -1;
7282
7283 if (errors == NULL || strcmp(errors, "strict") == 0) {
7284 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7285 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007286 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 if (exc != NULL) {
7288 PyCodec_StrictErrors(exc);
7289 Py_DECREF(exc);
7290 }
7291 Py_XDECREF(encoding_obj);
7292 return -1;
7293 }
7294
7295 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7296 pusedDefaultChar = &usedDefaultChar;
7297 else
7298 pusedDefaultChar = NULL;
7299
7300 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7301 PyErr_NoMemory();
7302 goto error;
7303 }
7304 outsize = insize * Py_ARRAY_LENGTH(buffer);
7305
7306 if (*outbytes == NULL) {
7307 /* Create string object */
7308 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7309 if (*outbytes == NULL)
7310 goto error;
7311 out = PyBytes_AS_STRING(*outbytes);
7312 }
7313 else {
7314 /* Extend string object */
7315 Py_ssize_t n = PyBytes_Size(*outbytes);
7316 if (n > PY_SSIZE_T_MAX - outsize) {
7317 PyErr_NoMemory();
7318 goto error;
7319 }
7320 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7321 goto error;
7322 out = PyBytes_AS_STRING(*outbytes) + n;
7323 }
7324
7325 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007328 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7329 wchar_t chars[2];
7330 int charsize;
7331 if (ch < 0x10000) {
7332 chars[0] = (wchar_t)ch;
7333 charsize = 1;
7334 }
7335 else {
7336 ch -= 0x10000;
7337 chars[0] = 0xd800 + (ch >> 10);
7338 chars[1] = 0xdc00 + (ch & 0x3ff);
7339 charsize = 2;
7340 }
7341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007343 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007344 buffer, Py_ARRAY_LENGTH(buffer),
7345 NULL, pusedDefaultChar);
7346 if (outsize > 0) {
7347 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7348 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007349 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 memcpy(out, buffer, outsize);
7351 out += outsize;
7352 continue;
7353 }
7354 }
7355 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7356 PyErr_SetFromWindowsErr(0);
7357 goto error;
7358 }
7359
Victor Stinner3a50e702011-10-18 21:21:00 +02007360 rep = unicode_encode_call_errorhandler(
7361 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007362 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007363 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 if (rep == NULL)
7365 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007366 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007367
7368 if (PyBytes_Check(rep)) {
7369 outsize = PyBytes_GET_SIZE(rep);
7370 if (outsize != 1) {
7371 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7372 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7373 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7374 Py_DECREF(rep);
7375 goto error;
7376 }
7377 out = PyBytes_AS_STRING(*outbytes) + offset;
7378 }
7379 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7380 out += outsize;
7381 }
7382 else {
7383 Py_ssize_t i;
7384 enum PyUnicode_Kind kind;
7385 void *data;
7386
7387 if (PyUnicode_READY(rep) < 0) {
7388 Py_DECREF(rep);
7389 goto error;
7390 }
7391
7392 outsize = PyUnicode_GET_LENGTH(rep);
7393 if (outsize != 1) {
7394 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7395 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7396 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7397 Py_DECREF(rep);
7398 goto error;
7399 }
7400 out = PyBytes_AS_STRING(*outbytes) + offset;
7401 }
7402 kind = PyUnicode_KIND(rep);
7403 data = PyUnicode_DATA(rep);
7404 for (i=0; i < outsize; i++) {
7405 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7406 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007407 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007408 encoding, unicode,
7409 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 "unable to encode error handler result to ASCII");
7411 Py_DECREF(rep);
7412 goto error;
7413 }
7414 *out = (unsigned char)ch;
7415 out++;
7416 }
7417 }
7418 Py_DECREF(rep);
7419 }
7420 /* write a NUL byte */
7421 *out = 0;
7422 outsize = out - PyBytes_AS_STRING(*outbytes);
7423 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7424 if (_PyBytes_Resize(outbytes, outsize) < 0)
7425 goto error;
7426 ret = 0;
7427
7428error:
7429 Py_XDECREF(encoding_obj);
7430 Py_XDECREF(errorHandler);
7431 Py_XDECREF(exc);
7432 return ret;
7433}
7434
Victor Stinner3a50e702011-10-18 21:21:00 +02007435static PyObject *
7436encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007437 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 const char *errors)
7439{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007440 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007442 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007444
Victor Stinner2fc507f2011-11-04 20:06:39 +01007445 if (PyUnicode_READY(unicode) < 0)
7446 return NULL;
7447 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007448
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 if (code_page < 0) {
7450 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7451 return NULL;
7452 }
7453
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 return PyBytes_FromStringAndSize(NULL, 0);
7456
Victor Stinner7581cef2011-11-03 22:32:33 +01007457 offset = 0;
7458 do
7459 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007460#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007462 chunks. */
7463 if (len > INT_MAX/2) {
7464 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007465 done = 0;
7466 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007469 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007470 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007471 done = 1;
7472 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473
Victor Stinner76a31a62011-11-04 00:05:13 +01007474 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 errors);
7477 if (ret == -2)
7478 ret = encode_code_page_errors(code_page, &outbytes,
7479 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007480 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007481 if (ret < 0) {
7482 Py_XDECREF(outbytes);
7483 return NULL;
7484 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485
Victor Stinner7581cef2011-11-03 22:32:33 +01007486 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007488 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 return outbytes;
7491}
7492
7493PyObject *
7494PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7495 Py_ssize_t size,
7496 const char *errors)
7497{
Victor Stinner7581cef2011-11-03 22:32:33 +01007498 PyObject *unicode, *res;
7499 unicode = PyUnicode_FromUnicode(p, size);
7500 if (unicode == NULL)
7501 return NULL;
7502 res = encode_code_page(CP_ACP, unicode, errors);
7503 Py_DECREF(unicode);
7504 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007505}
7506
7507PyObject *
7508PyUnicode_EncodeCodePage(int code_page,
7509 PyObject *unicode,
7510 const char *errors)
7511{
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007513}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007514
Alexander Belopolsky40018472011-02-26 01:02:56 +00007515PyObject *
7516PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007517{
7518 if (!PyUnicode_Check(unicode)) {
7519 PyErr_BadArgument();
7520 return NULL;
7521 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007522 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007523}
7524
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525#undef NEED_RETRY
7526
Victor Stinner99b95382011-07-04 14:23:54 +02007527#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007528
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529/* --- Character Mapping Codec -------------------------------------------- */
7530
Alexander Belopolsky40018472011-02-26 01:02:56 +00007531PyObject *
7532PyUnicode_DecodeCharmap(const char *s,
7533 Py_ssize_t size,
7534 PyObject *mapping,
7535 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538 Py_ssize_t startinpos;
7539 Py_ssize_t endinpos;
7540 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007541 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007542 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007543 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007544 PyObject *errorHandler = NULL;
7545 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007546
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 /* Default to Latin-1 */
7548 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007551 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007555 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007556 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007558 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007559 Py_ssize_t maplen;
7560 enum PyUnicode_Kind kind;
7561 void *data;
7562 Py_UCS4 x;
7563
7564 if (PyUnicode_READY(mapping) < 0)
7565 return NULL;
7566
7567 maplen = PyUnicode_GET_LENGTH(mapping);
7568 data = PyUnicode_DATA(mapping);
7569 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 while (s < e) {
7571 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007574 x = PyUnicode_READ(kind, data, ch);
7575 else
7576 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007578 if (x == 0xfffe)
7579 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 startinpos = s-starts;
7582 endinpos = startinpos+1;
7583 if (unicode_decode_call_errorhandler(
7584 errors, &errorHandler,
7585 "charmap", "character maps to <undefined>",
7586 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007587 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 goto onError;
7589 }
7590 continue;
7591 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007592
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007593 if (unicode_putchar(&v, &outpos, x) < 0)
7594 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007596 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007597 }
7598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 while (s < e) {
7600 unsigned char ch = *s;
7601 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007602
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7604 w = PyLong_FromLong((long)ch);
7605 if (w == NULL)
7606 goto onError;
7607 x = PyObject_GetItem(mapping, w);
7608 Py_DECREF(w);
7609 if (x == NULL) {
7610 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7611 /* No mapping found means: mapping is undefined. */
7612 PyErr_Clear();
7613 x = Py_None;
7614 Py_INCREF(x);
7615 } else
7616 goto onError;
7617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 /* Apply mapping */
7620 if (PyLong_Check(x)) {
7621 long value = PyLong_AS_LONG(x);
7622 if (value < 0 || value > 65535) {
7623 PyErr_SetString(PyExc_TypeError,
7624 "character mapping must be in range(65536)");
7625 Py_DECREF(x);
7626 goto onError;
7627 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007628 if (unicode_putchar(&v, &outpos, value) < 0)
7629 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 }
7631 else if (x == Py_None) {
7632 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 startinpos = s-starts;
7634 endinpos = startinpos+1;
7635 if (unicode_decode_call_errorhandler(
7636 errors, &errorHandler,
7637 "charmap", "character maps to <undefined>",
7638 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007639 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 Py_DECREF(x);
7641 goto onError;
7642 }
7643 Py_DECREF(x);
7644 continue;
7645 }
7646 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007647 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007648
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007649 if (PyUnicode_READY(x) < 0)
7650 goto onError;
7651 targetsize = PyUnicode_GET_LENGTH(x);
7652
7653 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007655 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007656 PyUnicode_READ_CHAR(x, 0)) < 0)
7657 goto onError;
7658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 else if (targetsize > 1) {
7660 /* 1-n mapping */
7661 if (targetsize > extrachars) {
7662 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 Py_ssize_t needed = (targetsize - extrachars) + \
7664 (targetsize << 2);
7665 extrachars += needed;
7666 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007667 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007668 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 Py_DECREF(x);
7670 goto onError;
7671 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007673 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7674 goto onError;
7675 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7676 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 extrachars -= targetsize;
7678 }
7679 /* 1-0 mapping: skip the character */
7680 }
7681 else {
7682 /* wrong return value */
7683 PyErr_SetString(PyExc_TypeError,
7684 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007685 Py_DECREF(x);
7686 goto onError;
7687 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 Py_DECREF(x);
7689 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007692 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007693 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 Py_XDECREF(errorHandler);
7695 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007696 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007697
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007699 Py_XDECREF(errorHandler);
7700 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 Py_XDECREF(v);
7702 return NULL;
7703}
7704
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007705/* Charmap encoding: the lookup table */
7706
Alexander Belopolsky40018472011-02-26 01:02:56 +00007707struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 PyObject_HEAD
7709 unsigned char level1[32];
7710 int count2, count3;
7711 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007712};
7713
7714static PyObject*
7715encoding_map_size(PyObject *obj, PyObject* args)
7716{
7717 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007718 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007720}
7721
7722static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007723 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 PyDoc_STR("Return the size (in bytes) of this object") },
7725 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007726};
7727
7728static void
7729encoding_map_dealloc(PyObject* o)
7730{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007731 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732}
7733
7734static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007735 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 "EncodingMap", /*tp_name*/
7737 sizeof(struct encoding_map), /*tp_basicsize*/
7738 0, /*tp_itemsize*/
7739 /* methods */
7740 encoding_map_dealloc, /*tp_dealloc*/
7741 0, /*tp_print*/
7742 0, /*tp_getattr*/
7743 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007744 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 0, /*tp_repr*/
7746 0, /*tp_as_number*/
7747 0, /*tp_as_sequence*/
7748 0, /*tp_as_mapping*/
7749 0, /*tp_hash*/
7750 0, /*tp_call*/
7751 0, /*tp_str*/
7752 0, /*tp_getattro*/
7753 0, /*tp_setattro*/
7754 0, /*tp_as_buffer*/
7755 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7756 0, /*tp_doc*/
7757 0, /*tp_traverse*/
7758 0, /*tp_clear*/
7759 0, /*tp_richcompare*/
7760 0, /*tp_weaklistoffset*/
7761 0, /*tp_iter*/
7762 0, /*tp_iternext*/
7763 encoding_map_methods, /*tp_methods*/
7764 0, /*tp_members*/
7765 0, /*tp_getset*/
7766 0, /*tp_base*/
7767 0, /*tp_dict*/
7768 0, /*tp_descr_get*/
7769 0, /*tp_descr_set*/
7770 0, /*tp_dictoffset*/
7771 0, /*tp_init*/
7772 0, /*tp_alloc*/
7773 0, /*tp_new*/
7774 0, /*tp_free*/
7775 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007776};
7777
7778PyObject*
7779PyUnicode_BuildEncodingMap(PyObject* string)
7780{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781 PyObject *result;
7782 struct encoding_map *mresult;
7783 int i;
7784 int need_dict = 0;
7785 unsigned char level1[32];
7786 unsigned char level2[512];
7787 unsigned char *mlevel1, *mlevel2, *mlevel3;
7788 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007789 int kind;
7790 void *data;
7791 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007794 PyErr_BadArgument();
7795 return NULL;
7796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 kind = PyUnicode_KIND(string);
7798 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007799 memset(level1, 0xFF, sizeof level1);
7800 memset(level2, 0xFF, sizeof level2);
7801
7802 /* If there isn't a one-to-one mapping of NULL to \0,
7803 or if there are non-BMP characters, we need to use
7804 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007806 need_dict = 1;
7807 for (i = 1; i < 256; i++) {
7808 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 ch = PyUnicode_READ(kind, data, i);
7810 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007811 need_dict = 1;
7812 break;
7813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007815 /* unmapped character */
7816 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 l1 = ch >> 11;
7818 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007819 if (level1[l1] == 0xFF)
7820 level1[l1] = count2++;
7821 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 }
7824
7825 if (count2 >= 0xFF || count3 >= 0xFF)
7826 need_dict = 1;
7827
7828 if (need_dict) {
7829 PyObject *result = PyDict_New();
7830 PyObject *key, *value;
7831 if (!result)
7832 return NULL;
7833 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007835 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 if (!key || !value)
7837 goto failed1;
7838 if (PyDict_SetItem(result, key, value) == -1)
7839 goto failed1;
7840 Py_DECREF(key);
7841 Py_DECREF(value);
7842 }
7843 return result;
7844 failed1:
7845 Py_XDECREF(key);
7846 Py_XDECREF(value);
7847 Py_DECREF(result);
7848 return NULL;
7849 }
7850
7851 /* Create a three-level trie */
7852 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7853 16*count2 + 128*count3 - 1);
7854 if (!result)
7855 return PyErr_NoMemory();
7856 PyObject_Init(result, &EncodingMapType);
7857 mresult = (struct encoding_map*)result;
7858 mresult->count2 = count2;
7859 mresult->count3 = count3;
7860 mlevel1 = mresult->level1;
7861 mlevel2 = mresult->level23;
7862 mlevel3 = mresult->level23 + 16*count2;
7863 memcpy(mlevel1, level1, 32);
7864 memset(mlevel2, 0xFF, 16*count2);
7865 memset(mlevel3, 0, 128*count3);
7866 count3 = 0;
7867 for (i = 1; i < 256; i++) {
7868 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007869 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 /* unmapped character */
7871 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 o1 = PyUnicode_READ(kind, data, i)>>11;
7873 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874 i2 = 16*mlevel1[o1] + o2;
7875 if (mlevel2[i2] == 0xFF)
7876 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 i3 = 128*mlevel2[i2] + o3;
7879 mlevel3[i3] = i;
7880 }
7881 return result;
7882}
7883
7884static int
Victor Stinner22168992011-11-20 17:09:18 +01007885encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886{
7887 struct encoding_map *map = (struct encoding_map*)mapping;
7888 int l1 = c>>11;
7889 int l2 = (c>>7) & 0xF;
7890 int l3 = c & 0x7F;
7891 int i;
7892
Victor Stinner22168992011-11-20 17:09:18 +01007893 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 if (c == 0)
7896 return 0;
7897 /* level 1*/
7898 i = map->level1[l1];
7899 if (i == 0xFF) {
7900 return -1;
7901 }
7902 /* level 2*/
7903 i = map->level23[16*i+l2];
7904 if (i == 0xFF) {
7905 return -1;
7906 }
7907 /* level 3 */
7908 i = map->level23[16*map->count2 + 128*i + l3];
7909 if (i == 0) {
7910 return -1;
7911 }
7912 return i;
7913}
7914
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007915/* Lookup the character ch in the mapping. If the character
7916 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007917 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007918static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007919charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920{
Christian Heimes217cfd12007-12-02 14:31:20 +00007921 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922 PyObject *x;
7923
7924 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926 x = PyObject_GetItem(mapping, w);
7927 Py_DECREF(w);
7928 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7930 /* No mapping found means: mapping is undefined. */
7931 PyErr_Clear();
7932 x = Py_None;
7933 Py_INCREF(x);
7934 return x;
7935 } else
7936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007938 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007940 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 long value = PyLong_AS_LONG(x);
7942 if (value < 0 || value > 255) {
7943 PyErr_SetString(PyExc_TypeError,
7944 "character mapping must be in range(256)");
7945 Py_DECREF(x);
7946 return NULL;
7947 }
7948 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007950 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 /* wrong return value */
7954 PyErr_Format(PyExc_TypeError,
7955 "character mapping must return integer, bytes or None, not %.400s",
7956 x->ob_type->tp_name);
7957 Py_DECREF(x);
7958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 }
7960}
7961
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007962static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007963charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007964{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007965 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7966 /* exponentially overallocate to minimize reallocations */
7967 if (requiredsize < 2*outsize)
7968 requiredsize = 2*outsize;
7969 if (_PyBytes_Resize(outobj, requiredsize))
7970 return -1;
7971 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007972}
7973
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974typedef enum charmapencode_result {
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
Alexander Belopolsky40018472011-02-26 01:02:56 +00007976} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007978 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 space is available. Return a new reference to the object that
7980 was put in the output buffer, or Py_None, if the mapping was undefined
7981 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007982 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007983static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007984charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007985 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007987 PyObject *rep;
7988 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007989 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990
Christian Heimes90aa7642007-12-19 02:45:37 +00007991 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007994 if (res == -1)
7995 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 if (outsize<requiredsize)
7997 if (charmapencode_resize(outobj, outpos, requiredsize))
7998 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007999 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 outstart[(*outpos)++] = (char)res;
8001 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002 }
8003
8004 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 Py_DECREF(rep);
8009 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 if (PyLong_Check(rep)) {
8012 Py_ssize_t requiredsize = *outpos+1;
8013 if (outsize<requiredsize)
8014 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8015 Py_DECREF(rep);
8016 return enc_EXCEPTION;
8017 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008018 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008020 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 else {
8022 const char *repchars = PyBytes_AS_STRING(rep);
8023 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8024 Py_ssize_t requiredsize = *outpos+repsize;
8025 if (outsize<requiredsize)
8026 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8027 Py_DECREF(rep);
8028 return enc_EXCEPTION;
8029 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008030 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 memcpy(outstart + *outpos, repchars, repsize);
8032 *outpos += repsize;
8033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008034 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035 Py_DECREF(rep);
8036 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037}
8038
8039/* handle an error in PyUnicode_EncodeCharmap
8040 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008041static int
8042charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008043 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008045 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008046 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008047{
8048 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008049 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008051 enum PyUnicode_Kind kind;
8052 void *data;
8053 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008054 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008055 Py_ssize_t collstartpos = *inpos;
8056 Py_ssize_t collendpos = *inpos+1;
8057 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008058 char *encoding = "charmap";
8059 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008061 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008062 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008064 if (PyUnicode_READY(unicode) < 0)
8065 return -1;
8066 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067 /* find all unencodable characters */
8068 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008070 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008071 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008072 val = encoding_map_lookup(ch, mapping);
8073 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 break;
8075 ++collendpos;
8076 continue;
8077 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008079 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8080 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 if (rep==NULL)
8082 return -1;
8083 else if (rep!=Py_None) {
8084 Py_DECREF(rep);
8085 break;
8086 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 }
8090 /* cache callback name lookup
8091 * (if not done yet, i.e. it's the first error) */
8092 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 if ((errors==NULL) || (!strcmp(errors, "strict")))
8094 *known_errorHandler = 1;
8095 else if (!strcmp(errors, "replace"))
8096 *known_errorHandler = 2;
8097 else if (!strcmp(errors, "ignore"))
8098 *known_errorHandler = 3;
8099 else if (!strcmp(errors, "xmlcharrefreplace"))
8100 *known_errorHandler = 4;
8101 else
8102 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 }
8104 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008105 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008106 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 return -1;
8108 case 2: /* replace */
8109 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 x = charmapencode_output('?', mapping, res, respos);
8111 if (x==enc_EXCEPTION) {
8112 return -1;
8113 }
8114 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008115 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 return -1;
8117 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 }
8119 /* fall through */
8120 case 3: /* ignore */
8121 *inpos = collendpos;
8122 break;
8123 case 4: /* xmlcharrefreplace */
8124 /* generate replacement (temporarily (mis)uses p) */
8125 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 char buffer[2+29+1+1];
8127 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 for (cp = buffer; *cp; ++cp) {
8130 x = charmapencode_output(*cp, mapping, res, respos);
8131 if (x==enc_EXCEPTION)
8132 return -1;
8133 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008134 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 return -1;
8136 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 }
8138 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008139 *inpos = collendpos;
8140 break;
8141 default:
8142 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008143 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008147 if (PyBytes_Check(repunicode)) {
8148 /* Directly copy bytes result to output. */
8149 Py_ssize_t outsize = PyBytes_Size(*res);
8150 Py_ssize_t requiredsize;
8151 repsize = PyBytes_Size(repunicode);
8152 requiredsize = *respos + repsize;
8153 if (requiredsize > outsize)
8154 /* Make room for all additional bytes. */
8155 if (charmapencode_resize(res, respos, requiredsize)) {
8156 Py_DECREF(repunicode);
8157 return -1;
8158 }
8159 memcpy(PyBytes_AsString(*res) + *respos,
8160 PyBytes_AsString(repunicode), repsize);
8161 *respos += repsize;
8162 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008163 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008164 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008165 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008166 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008167 if (PyUnicode_READY(repunicode) < 0) {
8168 Py_DECREF(repunicode);
8169 return -1;
8170 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008171 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008172 data = PyUnicode_DATA(repunicode);
8173 kind = PyUnicode_KIND(repunicode);
8174 for (index = 0; index < repsize; index++) {
8175 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8176 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008178 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return -1;
8180 }
8181 else if (x==enc_FAILED) {
8182 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008183 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 return -1;
8185 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
8187 *inpos = newpos;
8188 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 }
8190 return 0;
8191}
8192
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008194_PyUnicode_EncodeCharmap(PyObject *unicode,
8195 PyObject *mapping,
8196 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 /* output object */
8199 PyObject *res = NULL;
8200 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008201 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008202 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008204 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205 PyObject *errorHandler = NULL;
8206 PyObject *exc = NULL;
8207 /* the following variable is used for caching string comparisons
8208 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8209 * 3=ignore, 4=xmlcharrefreplace */
8210 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008212 if (PyUnicode_READY(unicode) < 0)
8213 return NULL;
8214 size = PyUnicode_GET_LENGTH(unicode);
8215
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 /* Default to Latin-1 */
8217 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008218 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 /* allocate enough for a simple encoding without
8221 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008222 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 if (res == NULL)
8224 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008225 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008229 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008231 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 if (x==enc_EXCEPTION) /* error */
8233 goto onError;
8234 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008235 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 &exc,
8237 &known_errorHandler, &errorHandler, errors,
8238 &res, &respos)) {
8239 goto onError;
8240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 else
8243 /* done with this character => adjust input position */
8244 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008248 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008249 if (_PyBytes_Resize(&res, respos) < 0)
8250 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 Py_XDECREF(exc);
8253 Py_XDECREF(errorHandler);
8254 return res;
8255
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 Py_XDECREF(res);
8258 Py_XDECREF(exc);
8259 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 return NULL;
8261}
8262
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008263/* Deprecated */
8264PyObject *
8265PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8266 Py_ssize_t size,
8267 PyObject *mapping,
8268 const char *errors)
8269{
8270 PyObject *result;
8271 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8272 if (unicode == NULL)
8273 return NULL;
8274 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8275 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008276 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277}
8278
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279PyObject *
8280PyUnicode_AsCharmapString(PyObject *unicode,
8281 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282{
8283 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 PyErr_BadArgument();
8285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008287 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288}
8289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008291static void
8292make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008294 Py_ssize_t startpos, Py_ssize_t endpos,
8295 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 *exceptionObject = _PyUnicodeTranslateError_Create(
8299 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
8301 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8303 goto onError;
8304 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8305 goto onError;
8306 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8307 goto onError;
8308 return;
8309 onError:
8310 Py_DECREF(*exceptionObject);
8311 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 }
8313}
8314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008316static void
8317raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008319 Py_ssize_t startpos, Py_ssize_t endpos,
8320 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321{
8322 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326}
8327
8328/* error handling callback helper:
8329 build arguments, call the callback and check the arguments,
8330 put the result into newpos and return the replacement string, which
8331 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008332static PyObject *
8333unicode_translate_call_errorhandler(const char *errors,
8334 PyObject **errorHandler,
8335 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337 Py_ssize_t startpos, Py_ssize_t endpos,
8338 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008340 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008342 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 PyObject *restuple;
8344 PyObject *resunicode;
8345
8346 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 }
8351
8352 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356
8357 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008362 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 Py_DECREF(restuple);
8364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 }
8366 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 &resunicode, &i_newpos)) {
8368 Py_DECREF(restuple);
8369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008371 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 else
8374 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8377 Py_DECREF(restuple);
8378 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008379 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 Py_INCREF(resunicode);
8381 Py_DECREF(restuple);
8382 return resunicode;
8383}
8384
8385/* Lookup the character ch in the mapping and put the result in result,
8386 which must be decrefed by the caller.
8387 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008388static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390{
Christian Heimes217cfd12007-12-02 14:31:20 +00008391 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 PyObject *x;
8393
8394 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 x = PyObject_GetItem(mapping, w);
8397 Py_DECREF(w);
8398 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8400 /* No mapping found means: use 1:1 mapping. */
8401 PyErr_Clear();
8402 *result = NULL;
8403 return 0;
8404 } else
8405 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 }
8407 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 *result = x;
8409 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008411 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 long value = PyLong_AS_LONG(x);
8413 long max = PyUnicode_GetMax();
8414 if (value < 0 || value > max) {
8415 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008416 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 Py_DECREF(x);
8418 return -1;
8419 }
8420 *result = x;
8421 return 0;
8422 }
8423 else if (PyUnicode_Check(x)) {
8424 *result = x;
8425 return 0;
8426 }
8427 else {
8428 /* wrong return value */
8429 PyErr_SetString(PyExc_TypeError,
8430 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008431 Py_DECREF(x);
8432 return -1;
8433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434}
8435/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 if not reallocate and adjust various state variables.
8437 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008438static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008443 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 /* exponentially overallocate to minimize reallocations */
8445 if (requiredsize < 2 * oldsize)
8446 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8448 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 }
8452 return 0;
8453}
8454/* lookup the character, put the result in the output string and adjust
8455 various state variables. Return a new reference to the object that
8456 was put in the output buffer in *result, or Py_None, if the mapping was
8457 undefined (in which case no character was written).
8458 The called must decref result.
8459 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008460static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8462 PyObject *mapping, Py_UCS4 **output,
8463 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008464 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8467 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 }
8473 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008475 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 }
8479 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 Py_ssize_t repsize;
8481 if (PyUnicode_READY(*res) == -1)
8482 return -1;
8483 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 if (repsize==1) {
8485 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 }
8488 else if (repsize!=0) {
8489 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 Py_ssize_t requiredsize = *opos +
8491 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 Py_ssize_t i;
8494 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 for(i = 0; i < repsize; i++)
8497 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 }
8500 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 return 0;
8503}
8504
/* Translate the str object `input` through `mapping`, one character at a
   time, and return the result as a new str object (NULL on error).

   input   - the string to translate; it is made ready here.
   mapping - lookup object handed to charmaptranslate_lookup(); NULL is
             rejected with PyErr_BadArgument().
   errors  - error handler name used for characters whose mapping is None:
             NULL or "strict" raises via raise_translate_exception(),
             "replace" writes '?', "ignore" drops the characters,
             "xmlcharrefreplace" writes "&#N;", anything else is delegated to
             unicode_translate_call_errorhandler().

   The output is assembled in a temporary Py_UCS4 buffer that is grown on
   demand with charmaptranslate_makespace() and converted to a str object at
   the very end.  An empty input is returned unchanged (new reference). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008505PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506_PyUnicode_TranslateCharmap(PyObject *input,
8507 PyObject *mapping,
8508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 /* input object */
8511 char *idata;
8512 Py_ssize_t size, i;
8513 int kind;
8514 /* output buffer */
8515 Py_UCS4 *output = NULL;
8516 Py_ssize_t osize;
8517 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008518 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 char *reason = "character maps to <undefined>";
8521 PyObject *errorHandler = NULL;
8522 PyObject *exc = NULL;
8523 /* the following variable is used for caching string comparisons
8524 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8525 * 3=ignore, 4=xmlcharrefreplace */
8526 int known_errorHandler = -1;
8527
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 PyErr_BadArgument();
8530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 if (PyUnicode_READY(input) == -1)
8534 return NULL;
8535 idata = (char*)PyUnicode_DATA(input);
8536 kind = PyUnicode_KIND(input);
8537 size = PyUnicode_GET_LENGTH(input);
8538 i = 0;
8539
/* nothing to translate: hand back the input itself with a new reference */
8540 if (size == 0) {
8541 Py_INCREF(input);
8542 return input;
8543 }
8544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 /* allocate enough for a simple 1:1 translation without
8546 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 osize = size;
8548 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8549 opos = 0;
8550 if (output == NULL) {
8551 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
/* main loop: i is the index of the next input character to translate */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 /* try to encode it */
8557 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 if (charmaptranslate_output(input, i, mapping,
8559 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 Py_XDECREF(x);
8561 goto onError;
8562 }
/* from here on x is only compared by address, never dereferenced */
Benjamin Peterson14339b62009-01-31 16:36:08 +00008563 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 else { /* untranslatable character */
8567 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8568 Py_ssize_t repsize;
8569 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 Py_ssize_t collstart = i;
8573 Py_ssize_t collend = i+1;
8574 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575
/* extend [collstart, collend) over the whole run of characters that map to None */
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 while (collend < size) {
8578 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 goto onError;
8580 Py_XDECREF(x);
8581 if (x!=Py_None)
8582 break;
8583 ++collend;
8584 }
8585 /* cache callback name lookup
8586 * (if not done yet, i.e. it's the first error) */
8587 if (known_errorHandler==-1) {
8588 if ((errors==NULL) || (!strcmp(errors, "strict")))
8589 known_errorHandler = 1;
8590 else if (!strcmp(errors, "replace"))
8591 known_errorHandler = 2;
8592 else if (!strcmp(errors, "ignore"))
8593 known_errorHandler = 3;
8594 else if (!strcmp(errors, "xmlcharrefreplace"))
8595 known_errorHandler = 4;
8596 else
8597 known_errorHandler = 0;
8598 }
/* every non-failing case below leaves i at the position where translation resumes */
8599 switch (known_errorHandler) {
8600 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 raise_translate_exception(&exc, input, collstart,
8602 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008603 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 case 2: /* replace */
8605 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 for (coll = collstart; coll<collend; coll++)
8607 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 /* fall through */
8609 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 break;
8612 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 /* generate replacement (temporarily (mis)uses i) */
8614 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 char buffer[2+29+1+1];
8616 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
/* room for what is written so far, this reference, and the input after the run */
8618 if (charmaptranslate_makespace(&output, &osize,
8619 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 goto onError;
8621 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 break;
8626 default:
/* custom handler: it supplies the replacement string and, via newpos, where to resume */
8627 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 reason, input, &exc,
8629 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008630 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008632 if (PyUnicode_READY(repunicode) < 0) {
8633 Py_DECREF(repunicode);
8634 goto onError;
8635 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 repsize = PyUnicode_GET_LENGTH(repunicode);
8638 if (charmaptranslate_makespace(&output, &osize,
8639 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 Py_DECREF(repunicode);
8641 goto onError;
8642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 for (uni2 = 0; repsize-->0; ++uni2)
8644 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8645 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008647 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008648 }
8649 }
/* build the result str from the first opos code points of the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8651 if (!res)
8652 goto onError;
8653 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 Py_XDECREF(exc);
8655 Py_XDECREF(errorHandler);
8656 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657
/* shared failure exit: release the buffer and the cached error-handling objects */
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660 Py_XDECREF(exc);
8661 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 return NULL;
8663}
8664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665/* Deprecated. Use PyUnicode_Translate instead. */
8666PyObject *
8667PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8668 Py_ssize_t size,
8669 PyObject *mapping,
8670 const char *errors)
8671{
8672 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8673 if (!unicode)
8674 return NULL;
8675 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8676}
8677
Alexander Belopolsky40018472011-02-26 01:02:56 +00008678PyObject *
8679PyUnicode_Translate(PyObject *str,
8680 PyObject *mapping,
8681 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682{
8683 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008684
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 str = PyUnicode_FromObject(str);
8686 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689 Py_DECREF(str);
8690 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008691
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 Py_XDECREF(str);
8694 return NULL;
8695}
Tim Petersced69f82003-09-16 20:30:58 +00008696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008698fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699{
8700 /* No need to call PyUnicode_READY(self) because this function is only
8701 called as a callback from fixup() which does it already. */
8702 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8703 const int kind = PyUnicode_KIND(self);
8704 void *data = PyUnicode_DATA(self);
8705 Py_UCS4 maxchar = 0, ch, fixed;
8706 Py_ssize_t i;
8707
8708 for (i = 0; i < len; ++i) {
8709 ch = PyUnicode_READ(kind, data, i);
8710 fixed = 0;
8711 if (ch > 127) {
8712 if (Py_UNICODE_ISSPACE(ch))
8713 fixed = ' ';
8714 else {
8715 const int decimal = Py_UNICODE_TODECIMAL(ch);
8716 if (decimal >= 0)
8717 fixed = '0' + decimal;
8718 }
8719 if (fixed != 0) {
8720 if (fixed > maxchar)
8721 maxchar = fixed;
8722 PyUnicode_WRITE(kind, data, i, fixed);
8723 }
8724 else if (ch > maxchar)
8725 maxchar = ch;
8726 }
8727 else if (ch > maxchar)
8728 maxchar = ch;
8729 }
8730
8731 return maxchar;
8732}
8733
8734PyObject *
8735_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8736{
8737 if (!PyUnicode_Check(unicode)) {
8738 PyErr_BadInternalCall();
8739 return NULL;
8740 }
8741 if (PyUnicode_READY(unicode) == -1)
8742 return NULL;
8743 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8744 /* If the string is already ASCII, just return the same string */
8745 Py_INCREF(unicode);
8746 return unicode;
8747 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008748 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749}
8750
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008751PyObject *
8752PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8753 Py_ssize_t length)
8754{
Victor Stinnerf0124502011-11-21 23:12:56 +01008755 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008756 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008757 Py_UCS4 maxchar;
8758 enum PyUnicode_Kind kind;
8759 void *data;
8760
8761 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008762 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008763 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008764 if (ch > 127) {
8765 int decimal = Py_UNICODE_TODECIMAL(ch);
8766 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008767 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008768 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008769 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008770 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008771
8772 /* Copy to a new string */
8773 decimal = PyUnicode_New(length, maxchar);
8774 if (decimal == NULL)
8775 return decimal;
8776 kind = PyUnicode_KIND(decimal);
8777 data = PyUnicode_DATA(decimal);
8778 /* Iterate over code points */
8779 for (i = 0; i < length; i++) {
8780 Py_UNICODE ch = s[i];
8781 if (ch > 127) {
8782 int decimal = Py_UNICODE_TODECIMAL(ch);
8783 if (decimal >= 0)
8784 ch = '0' + decimal;
8785 }
8786 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008788 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008789}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008790/* --- Decimal Encoder ---------------------------------------------------- */
8791
Alexander Belopolsky40018472011-02-26 01:02:56 +00008792int
8793PyUnicode_EncodeDecimal(Py_UNICODE *s,
8794 Py_ssize_t length,
8795 char *output,
8796 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008797{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008798 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008799 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008800 enum PyUnicode_Kind kind;
8801 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008802
8803 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 PyErr_BadArgument();
8805 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008806 }
8807
Victor Stinner42bf7752011-11-21 22:52:58 +01008808 unicode = PyUnicode_FromUnicode(s, length);
8809 if (unicode == NULL)
8810 return -1;
8811
Victor Stinner6345be92011-11-25 20:09:01 +01008812 if (PyUnicode_READY(unicode) < 0) {
8813 Py_DECREF(unicode);
8814 return -1;
8815 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008816 kind = PyUnicode_KIND(unicode);
8817 data = PyUnicode_DATA(unicode);
8818
Victor Stinnerb84d7232011-11-22 01:50:07 +01008819 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008820 PyObject *exc;
8821 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008823 Py_ssize_t startpos;
8824
8825 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008826
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008828 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008829 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008831 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 decimal = Py_UNICODE_TODECIMAL(ch);
8833 if (decimal >= 0) {
8834 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008835 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 continue;
8837 }
8838 if (0 < ch && ch < 256) {
8839 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008840 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 continue;
8842 }
Victor Stinner6345be92011-11-25 20:09:01 +01008843
Victor Stinner42bf7752011-11-21 22:52:58 +01008844 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008845 exc = NULL;
8846 raise_encode_exception(&exc, "decimal", unicode,
8847 startpos, startpos+1,
8848 "invalid decimal Unicode string");
8849 Py_XDECREF(exc);
8850 Py_DECREF(unicode);
8851 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008852 }
8853 /* 0-terminate the output string */
8854 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008855 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008856 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008857}
8858
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859/* --- Helpers ------------------------------------------------------------ */
8860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008862any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 Py_ssize_t start,
8864 Py_ssize_t end)
8865{
8866 int kind1, kind2, kind;
8867 void *buf1, *buf2;
8868 Py_ssize_t len1, len2, result;
8869
8870 kind1 = PyUnicode_KIND(s1);
8871 kind2 = PyUnicode_KIND(s2);
8872 kind = kind1 > kind2 ? kind1 : kind2;
8873 buf1 = PyUnicode_DATA(s1);
8874 buf2 = PyUnicode_DATA(s2);
8875 if (kind1 != kind)
8876 buf1 = _PyUnicode_AsKind(s1, kind);
8877 if (!buf1)
8878 return -2;
8879 if (kind2 != kind)
8880 buf2 = _PyUnicode_AsKind(s2, kind);
8881 if (!buf2) {
8882 if (kind1 != kind) PyMem_Free(buf1);
8883 return -2;
8884 }
8885 len1 = PyUnicode_GET_LENGTH(s1);
8886 len2 = PyUnicode_GET_LENGTH(s2);
8887
Victor Stinner794d5672011-10-10 03:21:36 +02008888 if (direction > 0) {
8889 switch(kind) {
8890 case PyUnicode_1BYTE_KIND:
8891 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8892 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8893 else
8894 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8895 break;
8896 case PyUnicode_2BYTE_KIND:
8897 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8898 break;
8899 case PyUnicode_4BYTE_KIND:
8900 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8901 break;
8902 default:
8903 assert(0); result = -2;
8904 }
8905 }
8906 else {
8907 switch(kind) {
8908 case PyUnicode_1BYTE_KIND:
8909 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8910 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8911 else
8912 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8913 break;
8914 case PyUnicode_2BYTE_KIND:
8915 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8916 break;
8917 case PyUnicode_4BYTE_KIND:
8918 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8919 break;
8920 default:
8921 assert(0); result = -2;
8922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 }
8924
8925 if (kind1 != kind)
8926 PyMem_Free(buf1);
8927 if (kind2 != kind)
8928 PyMem_Free(buf2);
8929
8930 return result;
8931}
8932
8933Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008934_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 Py_ssize_t n_buffer,
8936 void *digits, Py_ssize_t n_digits,
8937 Py_ssize_t min_width,
8938 const char *grouping,
8939 const char *thousands_sep)
8940{
8941 switch(kind) {
8942 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008943 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8944 return _PyUnicode_ascii_InsertThousandsGrouping(
8945 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8946 min_width, grouping, thousands_sep);
8947 else
8948 return _PyUnicode_ucs1_InsertThousandsGrouping(
8949 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8950 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 case PyUnicode_2BYTE_KIND:
8952 return _PyUnicode_ucs2_InsertThousandsGrouping(
8953 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8954 min_width, grouping, thousands_sep);
8955 case PyUnicode_4BYTE_KIND:
8956 return _PyUnicode_ucs4_InsertThousandsGrouping(
8957 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8958 min_width, grouping, thousands_sep);
8959 }
8960 assert(0);
8961 return -1;
8962}
8963
8964
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and turns negative `start` / `end` values into
   offsets from the end of the sequence (never going below 0).  `start` and
   `end` must be modifiable lvalues; `len` is evaluated several times, so it
   must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to a
   single statement and is safe inside unbraced if/else; invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008979
Alexander Belopolsky40018472011-02-26 01:02:56 +00008980Py_ssize_t
8981PyUnicode_Count(PyObject *str,
8982 PyObject *substr,
8983 Py_ssize_t start,
8984 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008986 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008987 PyObject* str_obj;
8988 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 int kind1, kind2, kind;
8990 void *buf1 = NULL, *buf2 = NULL;
8991 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008992
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008993 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008996 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008997 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 Py_DECREF(str_obj);
8999 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000 }
Tim Petersced69f82003-09-16 20:30:58 +00009001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 kind1 = PyUnicode_KIND(str_obj);
9003 kind2 = PyUnicode_KIND(sub_obj);
9004 kind = kind1 > kind2 ? kind1 : kind2;
9005 buf1 = PyUnicode_DATA(str_obj);
9006 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009007 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 if (!buf1)
9009 goto onError;
9010 buf2 = PyUnicode_DATA(sub_obj);
9011 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009012 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 if (!buf2)
9014 goto onError;
9015 len1 = PyUnicode_GET_LENGTH(str_obj);
9016 len2 = PyUnicode_GET_LENGTH(sub_obj);
9017
9018 ADJUST_INDICES(start, end, len1);
9019 switch(kind) {
9020 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009021 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9022 result = asciilib_count(
9023 ((Py_UCS1*)buf1) + start, end - start,
9024 buf2, len2, PY_SSIZE_T_MAX
9025 );
9026 else
9027 result = ucs1lib_count(
9028 ((Py_UCS1*)buf1) + start, end - start,
9029 buf2, len2, PY_SSIZE_T_MAX
9030 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 break;
9032 case PyUnicode_2BYTE_KIND:
9033 result = ucs2lib_count(
9034 ((Py_UCS2*)buf1) + start, end - start,
9035 buf2, len2, PY_SSIZE_T_MAX
9036 );
9037 break;
9038 case PyUnicode_4BYTE_KIND:
9039 result = ucs4lib_count(
9040 ((Py_UCS4*)buf1) + start, end - start,
9041 buf2, len2, PY_SSIZE_T_MAX
9042 );
9043 break;
9044 default:
9045 assert(0); result = 0;
9046 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047
9048 Py_DECREF(sub_obj);
9049 Py_DECREF(str_obj);
9050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 if (kind1 != kind)
9052 PyMem_Free(buf1);
9053 if (kind2 != kind)
9054 PyMem_Free(buf2);
9055
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 onError:
9058 Py_DECREF(sub_obj);
9059 Py_DECREF(str_obj);
9060 if (kind1 != kind && buf1)
9061 PyMem_Free(buf1);
9062 if (kind2 != kind && buf2)
9063 PyMem_Free(buf2);
9064 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065}
9066
Alexander Belopolsky40018472011-02-26 01:02:56 +00009067Py_ssize_t
9068PyUnicode_Find(PyObject *str,
9069 PyObject *sub,
9070 Py_ssize_t start,
9071 Py_ssize_t end,
9072 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009074 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009075
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009079 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 Py_DECREF(str);
9082 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 }
Tim Petersced69f82003-09-16 20:30:58 +00009084
Victor Stinner794d5672011-10-10 03:21:36 +02009085 result = any_find_slice(direction,
9086 str, sub, start, end
9087 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009088
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009090 Py_DECREF(sub);
9091
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 return result;
9093}
9094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095Py_ssize_t
9096PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9097 Py_ssize_t start, Py_ssize_t end,
9098 int direction)
9099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009101 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 if (PyUnicode_READY(str) == -1)
9103 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009104 if (start < 0 || end < 0) {
9105 PyErr_SetString(PyExc_IndexError, "string index out of range");
9106 return -2;
9107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 if (end > PyUnicode_GET_LENGTH(str))
9109 end = PyUnicode_GET_LENGTH(str);
9110 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009111 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9112 kind, end-start, ch, direction);
9113 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009115 else
9116 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117}
9118
Alexander Belopolsky40018472011-02-26 01:02:56 +00009119static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009120tailmatch(PyObject *self,
9121 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009122 Py_ssize_t start,
9123 Py_ssize_t end,
9124 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 int kind_self;
9127 int kind_sub;
9128 void *data_self;
9129 void *data_sub;
9130 Py_ssize_t offset;
9131 Py_ssize_t i;
9132 Py_ssize_t end_sub;
9133
9134 if (PyUnicode_READY(self) == -1 ||
9135 PyUnicode_READY(substring) == -1)
9136 return 0;
9137
9138 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139 return 1;
9140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9142 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 kind_self = PyUnicode_KIND(self);
9147 data_self = PyUnicode_DATA(self);
9148 kind_sub = PyUnicode_KIND(substring);
9149 data_sub = PyUnicode_DATA(substring);
9150 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9151
9152 if (direction > 0)
9153 offset = end;
9154 else
9155 offset = start;
9156
9157 if (PyUnicode_READ(kind_self, data_self, offset) ==
9158 PyUnicode_READ(kind_sub, data_sub, 0) &&
9159 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9160 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9161 /* If both are of the same kind, memcmp is sufficient */
9162 if (kind_self == kind_sub) {
9163 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009164 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 data_sub,
9166 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009167 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 }
9169 /* otherwise we have to compare each character by first accesing it */
9170 else {
9171 /* We do not need to compare 0 and len(substring)-1 because
9172 the if statement above ensured already that they are equal
9173 when we end up here. */
9174 // TODO: honor direction and do a forward or backwards search
9175 for (i = 1; i < end_sub; ++i) {
9176 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9177 PyUnicode_READ(kind_sub, data_sub, i))
9178 return 0;
9179 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182 }
9183
9184 return 0;
9185}
9186
Alexander Belopolsky40018472011-02-26 01:02:56 +00009187Py_ssize_t
9188PyUnicode_Tailmatch(PyObject *str,
9189 PyObject *substr,
9190 Py_ssize_t start,
9191 Py_ssize_t end,
9192 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009194 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009195
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 str = PyUnicode_FromObject(str);
9197 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199 substr = PyUnicode_FromObject(substr);
9200 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009201 Py_DECREF(str);
9202 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203 }
Tim Petersced69f82003-09-16 20:30:58 +00009204
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009205 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207 Py_DECREF(str);
9208 Py_DECREF(substr);
9209 return result;
9210}
9211
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212/* Apply fixfct filter to the Unicode object self and return a
9213 reference to the modified object */
9214
Alexander Belopolsky40018472011-02-26 01:02:56 +00009215static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009216fixup(PyObject *self,
9217 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 PyObject *u;
9220 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009221 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009223 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009226 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 /* fix functions return the new maximum character in a string,
9229 if the kind of the resulting unicode object does not change,
9230 everything is fine. Otherwise we need to change the string kind
9231 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009232 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009233
9234 if (maxchar_new == 0) {
9235 /* no changes */;
9236 if (PyUnicode_CheckExact(self)) {
9237 Py_DECREF(u);
9238 Py_INCREF(self);
9239 return self;
9240 }
9241 else
9242 return u;
9243 }
9244
9245 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 maxchar_new = 127;
9247 else if (maxchar_new <= 255)
9248 maxchar_new = 255;
9249 else if (maxchar_new <= 65535)
9250 maxchar_new = 65535;
9251 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009252 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253
Victor Stinnereaab6042011-12-11 22:22:39 +01009254 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009256
9257 /* In case the maximum character changed, we need to
9258 convert the string to the new category. */
9259 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9260 if (v == NULL) {
9261 Py_DECREF(u);
9262 return NULL;
9263 }
9264 if (maxchar_new > maxchar_old) {
9265 /* If the maxchar increased so that the kind changed, not all
9266 characters are representable anymore and we need to fix the
9267 string again. This only happens in very few cases. */
9268 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9269 maxchar_old = fixfct(v);
9270 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 }
9272 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009273 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009275 Py_DECREF(u);
9276 assert(_PyUnicode_CheckConsistency(v, 1));
9277 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278}
9279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009281fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 /* No need to call PyUnicode_READY(self) because this function is only
9284 called as a callback from fixup() which does it already. */
9285 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9286 const int kind = PyUnicode_KIND(self);
9287 void *data = PyUnicode_DATA(self);
9288 int touched = 0;
9289 Py_UCS4 maxchar = 0;
9290 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 for (i = 0; i < len; ++i) {
9293 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9294 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9295 if (up != ch) {
9296 if (up > maxchar)
9297 maxchar = up;
9298 PyUnicode_WRITE(kind, data, i, up);
9299 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 else if (ch > maxchar)
9302 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 }
9304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 if (touched)
9306 return maxchar;
9307 else
9308 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309}
9310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009312fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9315 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9316 const int kind = PyUnicode_KIND(self);
9317 void *data = PyUnicode_DATA(self);
9318 int touched = 0;
9319 Py_UCS4 maxchar = 0;
9320 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 for(i = 0; i < len; ++i) {
9323 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9324 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9325 if (lo != ch) {
9326 if (lo > maxchar)
9327 maxchar = lo;
9328 PyUnicode_WRITE(kind, data, i, lo);
9329 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 else if (ch > maxchar)
9332 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 }
9334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 if (touched)
9336 return maxchar;
9337 else
9338 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339}
9340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009342fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9345 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9346 const int kind = PyUnicode_KIND(self);
9347 void *data = PyUnicode_DATA(self);
9348 int touched = 0;
9349 Py_UCS4 maxchar = 0;
9350 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 for(i = 0; i < len; ++i) {
9353 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9354 Py_UCS4 nu = 0;
9355
9356 if (Py_UNICODE_ISUPPER(ch))
9357 nu = Py_UNICODE_TOLOWER(ch);
9358 else if (Py_UNICODE_ISLOWER(ch))
9359 nu = Py_UNICODE_TOUPPER(ch);
9360
9361 if (nu != 0) {
9362 if (nu > maxchar)
9363 maxchar = nu;
9364 PyUnicode_WRITE(kind, data, i, nu);
9365 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 else if (ch > maxchar)
9368 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
9370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (touched)
9372 return maxchar;
9373 else
9374 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375}
9376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009378fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9381 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9382 const int kind = PyUnicode_KIND(self);
9383 void *data = PyUnicode_DATA(self);
9384 int touched = 0;
9385 Py_UCS4 maxchar = 0;
9386 Py_ssize_t i = 0;
9387 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009388
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009389 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391
9392 ch = PyUnicode_READ(kind, data, i);
9393 if (!Py_UNICODE_ISUPPER(ch)) {
9394 maxchar = Py_UNICODE_TOUPPER(ch);
9395 PyUnicode_WRITE(kind, data, i, maxchar);
9396 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 ++i;
9399 for(; i < len; ++i) {
9400 ch = PyUnicode_READ(kind, data, i);
9401 if (!Py_UNICODE_ISLOWER(ch)) {
9402 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9403 if (lo > maxchar)
9404 maxchar = lo;
9405 PyUnicode_WRITE(kind, data, i, lo);
9406 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 else if (ch > maxchar)
9409 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411
9412 if (touched)
9413 return maxchar;
9414 else
9415 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416}
9417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009419fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9422 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9423 const int kind = PyUnicode_KIND(self);
9424 void *data = PyUnicode_DATA(self);
9425 Py_UCS4 maxchar = 0;
9426 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 int previous_is_cased;
9428
9429 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 if (len == 1) {
9431 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9432 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9433 if (ti != ch) {
9434 PyUnicode_WRITE(kind, data, i, ti);
9435 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009436 }
9437 else
9438 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 for(; i < len; ++i) {
9442 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9443 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009444
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 nu = Py_UNICODE_TOTITLE(ch);
9449
9450 if (nu > maxchar)
9451 maxchar = nu;
9452 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009453
Benjamin Peterson29060642009-01-31 22:14:21 +00009454 if (Py_UNICODE_ISLOWER(ch) ||
9455 Py_UNICODE_ISUPPER(ch) ||
9456 Py_UNICODE_ISTITLE(ch))
9457 previous_is_cased = 1;
9458 else
9459 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462}
9463
Tim Peters8ce9f162004-08-27 01:49:32 +00009464PyObject *
9465PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009468 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009470 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009471 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9472 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009473 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009475 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009477 int use_memcpy;
9478 unsigned char *res_data = NULL, *sep_data = NULL;
9479 PyObject *last_obj;
9480 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481
Tim Peters05eba1f2004-08-27 21:32:02 +00009482 fseq = PySequence_Fast(seq, "");
9483 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009484 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009485 }
9486
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009487 /* NOTE: the following code can't call back into Python code,
9488 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009489 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009490
Tim Peters05eba1f2004-08-27 21:32:02 +00009491 seqlen = PySequence_Fast_GET_SIZE(fseq);
9492 /* If empty sequence, return u"". */
9493 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009494 Py_DECREF(fseq);
9495 Py_INCREF(unicode_empty);
9496 res = unicode_empty;
9497 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009498 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009499
Tim Peters05eba1f2004-08-27 21:32:02 +00009500 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009501 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009502 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009503 if (seqlen == 1) {
9504 if (PyUnicode_CheckExact(items[0])) {
9505 res = items[0];
9506 Py_INCREF(res);
9507 Py_DECREF(fseq);
9508 return res;
9509 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009511 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009512 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009513 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009514 /* Set up sep and seplen */
9515 if (separator == NULL) {
9516 /* fall back to a blank space separator */
9517 sep = PyUnicode_FromOrdinal(' ');
9518 if (!sep)
9519 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009520 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009521 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009522 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009523 else {
9524 if (!PyUnicode_Check(separator)) {
9525 PyErr_Format(PyExc_TypeError,
9526 "separator: expected str instance,"
9527 " %.80s found",
9528 Py_TYPE(separator)->tp_name);
9529 goto onError;
9530 }
9531 if (PyUnicode_READY(separator))
9532 goto onError;
9533 sep = separator;
9534 seplen = PyUnicode_GET_LENGTH(separator);
9535 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9536 /* inc refcount to keep this code path symmetric with the
9537 above case of a blank separator */
9538 Py_INCREF(sep);
9539 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009540 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009541 }
9542
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009543 /* There are at least two things to join, or else we have a subclass
9544 * of str in the sequence.
9545 * Do a pre-pass to figure out the total amount of space we'll
9546 * need (sz), and see whether all argument are strings.
9547 */
9548 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009549#ifdef Py_DEBUG
9550 use_memcpy = 0;
9551#else
9552 use_memcpy = 1;
9553#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009554 for (i = 0; i < seqlen; i++) {
9555 const Py_ssize_t old_sz = sz;
9556 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009557 if (!PyUnicode_Check(item)) {
9558 PyErr_Format(PyExc_TypeError,
9559 "sequence item %zd: expected str instance,"
9560 " %.80s found",
9561 i, Py_TYPE(item)->tp_name);
9562 goto onError;
9563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 if (PyUnicode_READY(item) == -1)
9565 goto onError;
9566 sz += PyUnicode_GET_LENGTH(item);
9567 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009568 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009569 if (i != 0)
9570 sz += seplen;
9571 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9572 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 goto onError;
9575 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009576 if (use_memcpy && last_obj != NULL) {
9577 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9578 use_memcpy = 0;
9579 }
9580 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 }
Tim Petersced69f82003-09-16 20:30:58 +00009582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009584 if (res == NULL)
9585 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009586
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009587 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009588#ifdef Py_DEBUG
9589 use_memcpy = 0;
9590#else
9591 if (use_memcpy) {
9592 res_data = PyUnicode_1BYTE_DATA(res);
9593 kind = PyUnicode_KIND(res);
9594 if (seplen != 0)
9595 sep_data = PyUnicode_1BYTE_DATA(sep);
9596 }
9597#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009599 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009600 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009602 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009603 if (use_memcpy) {
9604 Py_MEMCPY(res_data,
9605 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009606 kind * seplen);
9607 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009608 }
9609 else {
9610 copy_characters(res, res_offset, sep, 0, seplen);
9611 res_offset += seplen;
9612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009614 itemlen = PyUnicode_GET_LENGTH(item);
9615 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009616 if (use_memcpy) {
9617 Py_MEMCPY(res_data,
9618 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009619 kind * itemlen);
9620 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 }
9622 else {
9623 copy_characters(res, res_offset, item, 0, itemlen);
9624 res_offset += itemlen;
9625 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009626 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009627 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 if (use_memcpy)
9629 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009630 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009631 else
9632 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009633
Tim Peters05eba1f2004-08-27 21:32:02 +00009634 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009636 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009640 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009642 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 return NULL;
9644}
9645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646#define FILL(kind, data, value, start, length) \
9647 do { \
9648 Py_ssize_t i_ = 0; \
9649 assert(kind != PyUnicode_WCHAR_KIND); \
9650 switch ((kind)) { \
9651 case PyUnicode_1BYTE_KIND: { \
9652 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9653 memset(to_, (unsigned char)value, length); \
9654 break; \
9655 } \
9656 case PyUnicode_2BYTE_KIND: { \
9657 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9658 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9659 break; \
9660 } \
9661 default: { \
9662 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9663 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9664 break; \
9665 } \
9666 } \
9667 } while (0)
9668
Victor Stinner9310abb2011-10-05 00:59:23 +02009669static PyObject *
9670pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009671 Py_ssize_t left,
9672 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyObject *u;
9676 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009677 int kind;
9678 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679
9680 if (left < 0)
9681 left = 0;
9682 if (right < 0)
9683 right = 0;
9684
Victor Stinnerc4b49542011-12-11 22:44:26 +01009685 if (left == 0 && right == 0)
9686 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9689 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009690 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9691 return NULL;
9692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9694 if (fill > maxchar)
9695 maxchar = fill;
9696 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009697 if (!u)
9698 return NULL;
9699
9700 kind = PyUnicode_KIND(u);
9701 data = PyUnicode_DATA(u);
9702 if (left)
9703 FILL(kind, data, fill, 0, left);
9704 if (right)
9705 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009706 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009707 assert(_PyUnicode_CheckConsistency(u, 1));
9708 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
Alexander Belopolsky40018472011-02-26 01:02:56 +00009712PyObject *
9713PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716
9717 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 switch(PyUnicode_KIND(string)) {
9722 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009723 if (PyUnicode_IS_ASCII(string))
9724 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009725 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009726 PyUnicode_GET_LENGTH(string), keepends);
9727 else
9728 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009729 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009730 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 break;
9732 case PyUnicode_2BYTE_KIND:
9733 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009734 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 PyUnicode_GET_LENGTH(string), keepends);
9736 break;
9737 case PyUnicode_4BYTE_KIND:
9738 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009739 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 PyUnicode_GET_LENGTH(string), keepends);
9741 break;
9742 default:
9743 assert(0);
9744 list = 0;
9745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746 Py_DECREF(string);
9747 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748}
9749
Alexander Belopolsky40018472011-02-26 01:02:56 +00009750static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009751split(PyObject *self,
9752 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009753 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 int kind1, kind2, kind;
9756 void *buf1, *buf2;
9757 Py_ssize_t len1, len2;
9758 PyObject* out;
9759
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009761 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 if (PyUnicode_READY(self) == -1)
9764 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 if (substring == NULL)
9767 switch(PyUnicode_KIND(self)) {
9768 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009769 if (PyUnicode_IS_ASCII(self))
9770 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 PyUnicode_GET_LENGTH(self), maxcount
9773 );
9774 else
9775 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009777 PyUnicode_GET_LENGTH(self), maxcount
9778 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 case PyUnicode_2BYTE_KIND:
9780 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 PyUnicode_GET_LENGTH(self), maxcount
9783 );
9784 case PyUnicode_4BYTE_KIND:
9785 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009786 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 PyUnicode_GET_LENGTH(self), maxcount
9788 );
9789 default:
9790 assert(0);
9791 return NULL;
9792 }
9793
9794 if (PyUnicode_READY(substring) == -1)
9795 return NULL;
9796
9797 kind1 = PyUnicode_KIND(self);
9798 kind2 = PyUnicode_KIND(substring);
9799 kind = kind1 > kind2 ? kind1 : kind2;
9800 buf1 = PyUnicode_DATA(self);
9801 buf2 = PyUnicode_DATA(substring);
9802 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 if (!buf1)
9805 return NULL;
9806 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (!buf2) {
9809 if (kind1 != kind) PyMem_Free(buf1);
9810 return NULL;
9811 }
9812 len1 = PyUnicode_GET_LENGTH(self);
9813 len2 = PyUnicode_GET_LENGTH(substring);
9814
9815 switch(kind) {
9816 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009817 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9818 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009819 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009820 else
9821 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009822 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 break;
9824 case PyUnicode_2BYTE_KIND:
9825 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 break;
9828 case PyUnicode_4BYTE_KIND:
9829 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 break;
9832 default:
9833 out = NULL;
9834 }
9835 if (kind1 != kind)
9836 PyMem_Free(buf1);
9837 if (kind2 != kind)
9838 PyMem_Free(buf2);
9839 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840}
9841
Alexander Belopolsky40018472011-02-26 01:02:56 +00009842static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009843rsplit(PyObject *self,
9844 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009845 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 int kind1, kind2, kind;
9848 void *buf1, *buf2;
9849 Py_ssize_t len1, len2;
9850 PyObject* out;
9851
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009852 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009853 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (PyUnicode_READY(self) == -1)
9856 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 if (substring == NULL)
9859 switch(PyUnicode_KIND(self)) {
9860 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009861 if (PyUnicode_IS_ASCII(self))
9862 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009863 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864 PyUnicode_GET_LENGTH(self), maxcount
9865 );
9866 else
9867 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 PyUnicode_GET_LENGTH(self), maxcount
9870 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 case PyUnicode_2BYTE_KIND:
9872 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 PyUnicode_GET_LENGTH(self), maxcount
9875 );
9876 case PyUnicode_4BYTE_KIND:
9877 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009878 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 PyUnicode_GET_LENGTH(self), maxcount
9880 );
9881 default:
9882 assert(0);
9883 return NULL;
9884 }
9885
9886 if (PyUnicode_READY(substring) == -1)
9887 return NULL;
9888
9889 kind1 = PyUnicode_KIND(self);
9890 kind2 = PyUnicode_KIND(substring);
9891 kind = kind1 > kind2 ? kind1 : kind2;
9892 buf1 = PyUnicode_DATA(self);
9893 buf2 = PyUnicode_DATA(substring);
9894 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009895 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 if (!buf1)
9897 return NULL;
9898 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009899 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (!buf2) {
9901 if (kind1 != kind) PyMem_Free(buf1);
9902 return NULL;
9903 }
9904 len1 = PyUnicode_GET_LENGTH(self);
9905 len2 = PyUnicode_GET_LENGTH(substring);
9906
9907 switch(kind) {
9908 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009909 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9910 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009911 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009912 else
9913 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009914 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 break;
9916 case PyUnicode_2BYTE_KIND:
9917 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009918 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 break;
9920 case PyUnicode_4BYTE_KIND:
9921 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 break;
9924 default:
9925 out = NULL;
9926 }
9927 if (kind1 != kind)
9928 PyMem_Free(buf1);
9929 if (kind2 != kind)
9930 PyMem_Free(buf2);
9931 return out;
9932}
9933
9934static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009935anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9936 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937{
9938 switch(kind) {
9939 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009940 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9941 return asciilib_find(buf1, len1, buf2, len2, offset);
9942 else
9943 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 case PyUnicode_2BYTE_KIND:
9945 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9946 case PyUnicode_4BYTE_KIND:
9947 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9948 }
9949 assert(0);
9950 return -1;
9951}
9952
9953static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009954anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9955 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956{
9957 switch(kind) {
9958 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009959 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9960 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9961 else
9962 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 case PyUnicode_2BYTE_KIND:
9964 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9965 case PyUnicode_4BYTE_KIND:
9966 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9967 }
9968 assert(0);
9969 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009970}
9971
Alexander Belopolsky40018472011-02-26 01:02:56 +00009972static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973replace(PyObject *self, PyObject *str1,
9974 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 PyObject *u;
9977 char *sbuf = PyUnicode_DATA(self);
9978 char *buf1 = PyUnicode_DATA(str1);
9979 char *buf2 = PyUnicode_DATA(str2);
9980 int srelease = 0, release1 = 0, release2 = 0;
9981 int skind = PyUnicode_KIND(self);
9982 int kind1 = PyUnicode_KIND(str1);
9983 int kind2 = PyUnicode_KIND(str2);
9984 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9985 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9986 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009987 int mayshrink;
9988 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989
9990 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009991 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009993 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994
Victor Stinner59de0ee2011-10-07 10:01:28 +02009995 if (str1 == str2)
9996 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 if (skind < kind1)
9998 /* substring too wide to be present */
9999 goto nothing;
10000
Victor Stinner49a0a212011-10-12 23:46:10 +020010001 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10002 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10003 /* Replacing str1 with str2 may cause a maxchar reduction in the
10004 result string. */
10005 mayshrink = (maxchar_str2 < maxchar);
10006 maxchar = Py_MAX(maxchar, maxchar_str2);
10007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010009 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010010 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010012 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010014 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010015 Py_UCS4 u1, u2;
10016 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010018 if (findchar(sbuf, PyUnicode_KIND(self),
10019 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010020 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010023 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010025 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 rkind = PyUnicode_KIND(u);
10027 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10028 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010029 if (--maxcount < 0)
10030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010032 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010033 }
10034 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 int rkind = skind;
10036 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 if (kind1 < rkind) {
10039 /* widen substring */
10040 buf1 = _PyUnicode_AsKind(str1, rkind);
10041 if (!buf1) goto error;
10042 release1 = 1;
10043 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010044 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010045 if (i < 0)
10046 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (rkind > kind2) {
10048 /* widen replacement */
10049 buf2 = _PyUnicode_AsKind(str2, rkind);
10050 if (!buf2) goto error;
10051 release2 = 1;
10052 }
10053 else if (rkind < kind2) {
10054 /* widen self and buf1 */
10055 rkind = kind2;
10056 if (release1) PyMem_Free(buf1);
10057 sbuf = _PyUnicode_AsKind(self, rkind);
10058 if (!sbuf) goto error;
10059 srelease = 1;
10060 buf1 = _PyUnicode_AsKind(str1, rkind);
10061 if (!buf1) goto error;
10062 release1 = 1;
10063 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010064 u = PyUnicode_New(slen, maxchar);
10065 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010067 assert(PyUnicode_KIND(u) == rkind);
10068 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010069
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010071 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010072 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010074 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010076
10077 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010078 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010079 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010080 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010081 if (i == -1)
10082 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010083 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010085 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010089 }
10090 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 Py_ssize_t n, i, j, ires;
10092 Py_ssize_t product, new_size;
10093 int rkind = skind;
10094 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010097 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 buf1 = _PyUnicode_AsKind(str1, rkind);
10099 if (!buf1) goto error;
10100 release1 = 1;
10101 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010102 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010103 if (n == 0)
10104 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010106 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 buf2 = _PyUnicode_AsKind(str2, rkind);
10108 if (!buf2) goto error;
10109 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010112 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 rkind = kind2;
10114 sbuf = _PyUnicode_AsKind(self, rkind);
10115 if (!sbuf) goto error;
10116 srelease = 1;
10117 if (release1) PyMem_Free(buf1);
10118 buf1 = _PyUnicode_AsKind(str1, rkind);
10119 if (!buf1) goto error;
10120 release1 = 1;
10121 }
10122 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10123 PyUnicode_GET_LENGTH(str1))); */
10124 product = n * (len2-len1);
10125 if ((product / (len2-len1)) != n) {
10126 PyErr_SetString(PyExc_OverflowError,
10127 "replace string is too long");
10128 goto error;
10129 }
10130 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010131 if (new_size == 0) {
10132 Py_INCREF(unicode_empty);
10133 u = unicode_empty;
10134 goto done;
10135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10137 PyErr_SetString(PyExc_OverflowError,
10138 "replace string is too long");
10139 goto error;
10140 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010141 u = PyUnicode_New(new_size, maxchar);
10142 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010144 assert(PyUnicode_KIND(u) == rkind);
10145 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 ires = i = 0;
10147 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010148 while (n-- > 0) {
10149 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010152 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010153 if (j == -1)
10154 break;
10155 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010156 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010157 memcpy(res + rkind * ires,
10158 sbuf + rkind * i,
10159 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161 }
10162 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010164 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010166 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010172 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010173 memcpy(res + rkind * ires,
10174 sbuf + rkind * i,
10175 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010176 }
10177 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010178 /* interleave */
10179 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010180 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010182 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 if (--n <= 0)
10185 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010186 memcpy(res + rkind * ires,
10187 sbuf + rkind * i,
10188 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 ires++;
10190 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010191 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010192 memcpy(res + rkind * ires,
10193 sbuf + rkind * i,
10194 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010196 }
10197
10198 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010199 unicode_adjust_maxchar(&u);
10200 if (u == NULL)
10201 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010203
10204 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 if (srelease)
10206 PyMem_FREE(sbuf);
10207 if (release1)
10208 PyMem_FREE(buf1);
10209 if (release2)
10210 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010211 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010213
Benjamin Peterson29060642009-01-31 22:14:21 +000010214 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010215 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 if (srelease)
10217 PyMem_FREE(sbuf);
10218 if (release1)
10219 PyMem_FREE(buf1);
10220 if (release2)
10221 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010222 return unicode_result_unchanged(self);
10223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 error:
10225 if (srelease && sbuf)
10226 PyMem_FREE(sbuf);
10227 if (release1 && buf1)
10228 PyMem_FREE(buf1);
10229 if (release2 && buf2)
10230 PyMem_FREE(buf2);
10231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232}
10233
10234/* --- Unicode Object Methods --------------------------------------------- */
10235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010236PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010237 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238\n\
10239Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010240characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241
10242static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010243unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245 return fixup(self, fixtitle);
10246}
10247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010248PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010249 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250\n\
10251Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010252have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
10254static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010255unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257 return fixup(self, fixcapitalize);
10258}
10259
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled by the surrounding "#if 0".

   Split self with split(self, NULL, -1), run every piece through
   fixup(..., fixcapitalize) and join the pieces with
   PyUnicode_Join(NULL, ...).  Returns the joined string, or NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL)
            goto finish;            /* result is still NULL */
        /* PyList_SET_ITEM does not release the previous item. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finish:
    Py_DECREF(words);
    return result;
}
#endif
10297
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010298/* Argument converter. Coerces to a single unicode character */
10299
10300static int
10301convert_uc(PyObject *obj, void *addr)
10302{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010304 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010305
Benjamin Peterson14339b62009-01-31 16:36:08 +000010306 uniobj = PyUnicode_FromObject(obj);
10307 if (uniobj == NULL) {
10308 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010310 return 0;
10311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010313 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010314 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010315 Py_DECREF(uniobj);
10316 return 0;
10317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010319 Py_DECREF(uniobj);
10320 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010321}
10322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010323PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010324 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010326Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010327done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328
10329static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010330unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010332 Py_ssize_t marg, left;
10333 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 Py_UCS4 fillchar = ' ';
10335
Victor Stinnere9a29352011-10-01 02:14:59 +020010336 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338
Victor Stinnerc4b49542011-12-11 22:44:26 +010010339 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 return NULL;
10341
Victor Stinnerc4b49542011-12-11 22:44:26 +010010342 if (PyUnicode_GET_LENGTH(self) >= width)
10343 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344
Victor Stinnerc4b49542011-12-11 22:44:26 +010010345 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346 left = marg / 2 + (marg & width & 1);
10347
Victor Stinner9310abb2011-10-05 00:59:23 +020010348 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349}
10350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351/* This function assumes that str1 and str2 are readied by the caller. */
10352
Marc-André Lemburge5034372000-08-08 08:04:29 +000010353static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010354unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 int kind1, kind2;
10357 void *data1, *data2;
10358 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 kind1 = PyUnicode_KIND(str1);
10361 kind2 = PyUnicode_KIND(str2);
10362 data1 = PyUnicode_DATA(str1);
10363 data2 = PyUnicode_DATA(str2);
10364 len1 = PyUnicode_GET_LENGTH(str1);
10365 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 for (i = 0; i < len1 && i < len2; ++i) {
10368 Py_UCS4 c1, c2;
10369 c1 = PyUnicode_READ(kind1, data1, i);
10370 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010371
10372 if (c1 != c2)
10373 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010374 }
10375
10376 return (len1 < len2) ? -1 : (len1 != len2);
10377}
10378
Alexander Belopolsky40018472011-02-26 01:02:56 +000010379int
10380PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10383 if (PyUnicode_READY(left) == -1 ||
10384 PyUnicode_READY(right) == -1)
10385 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010386 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010388 PyErr_Format(PyExc_TypeError,
10389 "Can't compare %.100s and %.100s",
10390 left->ob_type->tp_name,
10391 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 return -1;
10393}
10394
Martin v. Löwis5b222132007-06-10 09:51:05 +000010395int
10396PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 Py_ssize_t i;
10399 int kind;
10400 void *data;
10401 Py_UCS4 chr;
10402
Victor Stinner910337b2011-10-03 03:20:16 +020010403 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 if (PyUnicode_READY(uni) == -1)
10405 return -1;
10406 kind = PyUnicode_KIND(uni);
10407 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010408 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10410 if (chr != str[i])
10411 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010412 /* This check keeps Python strings that end in '\0' from comparing equal
10413 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010416 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010417 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010418 return 0;
10419}
10420
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010421
Benjamin Peterson29060642009-01-31 22:14:21 +000010422#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010423 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010424
Alexander Belopolsky40018472011-02-26 01:02:56 +000010425PyObject *
10426PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010427{
10428 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010429
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010430 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10431 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (PyUnicode_READY(left) == -1 ||
10433 PyUnicode_READY(right) == -1)
10434 return NULL;
10435 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10436 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010437 if (op == Py_EQ) {
10438 Py_INCREF(Py_False);
10439 return Py_False;
10440 }
10441 if (op == Py_NE) {
10442 Py_INCREF(Py_True);
10443 return Py_True;
10444 }
10445 }
10446 if (left == right)
10447 result = 0;
10448 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010449 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010450
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010451 /* Convert the return value to a Boolean */
10452 switch (op) {
10453 case Py_EQ:
10454 v = TEST_COND(result == 0);
10455 break;
10456 case Py_NE:
10457 v = TEST_COND(result != 0);
10458 break;
10459 case Py_LE:
10460 v = TEST_COND(result <= 0);
10461 break;
10462 case Py_GE:
10463 v = TEST_COND(result >= 0);
10464 break;
10465 case Py_LT:
10466 v = TEST_COND(result == -1);
10467 break;
10468 case Py_GT:
10469 v = TEST_COND(result == 1);
10470 break;
10471 default:
10472 PyErr_BadArgument();
10473 return NULL;
10474 }
10475 Py_INCREF(v);
10476 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010478
Brian Curtindfc80e32011-08-10 20:28:54 -050010479 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010480}
10481
Alexander Belopolsky40018472011-02-26 01:02:56 +000010482int
10483PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010484{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 int kind1, kind2, kind;
10487 void *buf1, *buf2;
10488 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010489 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010490
10491 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010492 sub = PyUnicode_FromObject(element);
10493 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 PyErr_Format(PyExc_TypeError,
10495 "'in <string>' requires string as left operand, not %s",
10496 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (PyUnicode_READY(sub) == -1)
10500 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010501
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010503 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 Py_DECREF(sub);
10505 return -1;
10506 }
10507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 kind1 = PyUnicode_KIND(str);
10509 kind2 = PyUnicode_KIND(sub);
10510 kind = kind1 > kind2 ? kind1 : kind2;
10511 buf1 = PyUnicode_DATA(str);
10512 buf2 = PyUnicode_DATA(sub);
10513 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010514 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (!buf1) {
10516 Py_DECREF(sub);
10517 return -1;
10518 }
10519 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010520 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (!buf2) {
10522 Py_DECREF(sub);
10523 if (kind1 != kind) PyMem_Free(buf1);
10524 return -1;
10525 }
10526 len1 = PyUnicode_GET_LENGTH(str);
10527 len2 = PyUnicode_GET_LENGTH(sub);
10528
10529 switch(kind) {
10530 case PyUnicode_1BYTE_KIND:
10531 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10532 break;
10533 case PyUnicode_2BYTE_KIND:
10534 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10535 break;
10536 case PyUnicode_4BYTE_KIND:
10537 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10538 break;
10539 default:
10540 result = -1;
10541 assert(0);
10542 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010543
10544 Py_DECREF(str);
10545 Py_DECREF(sub);
10546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (kind1 != kind)
10548 PyMem_Free(buf1);
10549 if (kind2 != kind)
10550 PyMem_Free(buf2);
10551
Guido van Rossum403d68b2000-03-13 15:55:09 +000010552 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010553}
10554
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555/* Concat to string or Unicode object giving a new Unicode object. */
10556
Alexander Belopolsky40018472011-02-26 01:02:56 +000010557PyObject *
10558PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010561 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010562 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563
10564 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010567 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571
10572 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010573 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010577 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010578 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580 }
10581
Victor Stinner488fa492011-12-12 00:01:39 +010010582 u_len = PyUnicode_GET_LENGTH(u);
10583 v_len = PyUnicode_GET_LENGTH(v);
10584 if (u_len > PY_SSIZE_T_MAX - v_len) {
10585 PyErr_SetString(PyExc_OverflowError,
10586 "strings are too large to concat");
10587 goto onError;
10588 }
10589 new_len = u_len + v_len;
10590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010592 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10593 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010596 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010599 copy_characters(w, 0, u, 0, u_len);
10600 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 Py_DECREF(u);
10602 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010603 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Benjamin Peterson29060642009-01-31 22:14:21 +000010606 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 Py_XDECREF(u);
10608 Py_XDECREF(v);
10609 return NULL;
10610}
10611
Walter Dörwald1ab83302007-05-18 17:15:44 +000010612void
Victor Stinner23e56682011-10-03 03:54:37 +020010613PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010614{
Victor Stinner23e56682011-10-03 03:54:37 +020010615 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010616 Py_UCS4 maxchar, maxchar2;
10617 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010618
10619 if (p_left == NULL) {
10620 if (!PyErr_Occurred())
10621 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010622 return;
10623 }
Victor Stinner23e56682011-10-03 03:54:37 +020010624 left = *p_left;
10625 if (right == NULL || !PyUnicode_Check(left)) {
10626 if (!PyErr_Occurred())
10627 PyErr_BadInternalCall();
10628 goto error;
10629 }
10630
Victor Stinnere1335c72011-10-04 20:53:03 +020010631 if (PyUnicode_READY(left))
10632 goto error;
10633 if (PyUnicode_READY(right))
10634 goto error;
10635
Victor Stinner488fa492011-12-12 00:01:39 +010010636 /* Shortcuts */
10637 if (left == unicode_empty) {
10638 Py_DECREF(left);
10639 Py_INCREF(right);
10640 *p_left = right;
10641 return;
10642 }
10643 if (right == unicode_empty)
10644 return;
10645
10646 left_len = PyUnicode_GET_LENGTH(left);
10647 right_len = PyUnicode_GET_LENGTH(right);
10648 if (left_len > PY_SSIZE_T_MAX - right_len) {
10649 PyErr_SetString(PyExc_OverflowError,
10650 "strings are too large to concat");
10651 goto error;
10652 }
10653 new_len = left_len + right_len;
10654
10655 if (unicode_modifiable(left)
10656 && PyUnicode_CheckExact(right)
10657 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010658 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10659 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010660 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010661 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010662 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10663 {
10664 /* append inplace */
10665 if (unicode_resize(p_left, new_len) != 0) {
10666 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10667 * deallocated so it cannot be put back into
10668 * 'variable'. The MemoryError is raised when there
10669 * is no value in 'variable', which might (very
10670 * remotely) be a cause of incompatibilities.
10671 */
10672 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010673 }
Victor Stinner488fa492011-12-12 00:01:39 +010010674 /* copy 'right' into the newly allocated area of 'left' */
10675 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010676 }
Victor Stinner488fa492011-12-12 00:01:39 +010010677 else {
10678 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10679 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10680 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010681
Victor Stinner488fa492011-12-12 00:01:39 +010010682 /* Concat the two Unicode strings */
10683 res = PyUnicode_New(new_len, maxchar);
10684 if (res == NULL)
10685 goto error;
10686 copy_characters(res, 0, left, 0, left_len);
10687 copy_characters(res, left_len, right, 0, right_len);
10688 Py_DECREF(left);
10689 *p_left = res;
10690 }
10691 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010692 return;
10693
10694error:
Victor Stinner488fa492011-12-12 00:01:39 +010010695 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010696}
10697
10698void
10699PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10700{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010701 PyUnicode_Append(pleft, right);
10702 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010703}
10704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010705PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010709string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010710interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010713unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010715 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010716 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010717 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 int kind1, kind2, kind;
10720 void *buf1, *buf2;
10721 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
Jesus Ceaac451502011-04-20 17:09:23 +020010723 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10724 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 kind1 = PyUnicode_KIND(self);
10728 kind2 = PyUnicode_KIND(substring);
10729 kind = kind1 > kind2 ? kind1 : kind2;
10730 buf1 = PyUnicode_DATA(self);
10731 buf2 = PyUnicode_DATA(substring);
10732 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010733 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (!buf1) {
10735 Py_DECREF(substring);
10736 return NULL;
10737 }
10738 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010739 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 if (!buf2) {
10741 Py_DECREF(substring);
10742 if (kind1 != kind) PyMem_Free(buf1);
10743 return NULL;
10744 }
10745 len1 = PyUnicode_GET_LENGTH(self);
10746 len2 = PyUnicode_GET_LENGTH(substring);
10747
10748 ADJUST_INDICES(start, end, len1);
10749 switch(kind) {
10750 case PyUnicode_1BYTE_KIND:
10751 iresult = ucs1lib_count(
10752 ((Py_UCS1*)buf1) + start, end - start,
10753 buf2, len2, PY_SSIZE_T_MAX
10754 );
10755 break;
10756 case PyUnicode_2BYTE_KIND:
10757 iresult = ucs2lib_count(
10758 ((Py_UCS2*)buf1) + start, end - start,
10759 buf2, len2, PY_SSIZE_T_MAX
10760 );
10761 break;
10762 case PyUnicode_4BYTE_KIND:
10763 iresult = ucs4lib_count(
10764 ((Py_UCS4*)buf1) + start, end - start,
10765 buf2, len2, PY_SSIZE_T_MAX
10766 );
10767 break;
10768 default:
10769 assert(0); iresult = 0;
10770 }
10771
10772 result = PyLong_FromSsize_t(iresult);
10773
10774 if (kind1 != kind)
10775 PyMem_Free(buf1);
10776 if (kind2 != kind)
10777 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
10779 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010780
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 return result;
10782}
10783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010784PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010785 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010787Encode S using the codec registered for encoding. Default encoding\n\
10788is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010789handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010790a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10791'xmlcharrefreplace' as well as any other name registered with\n\
10792codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
10794static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010795unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010797 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 char *encoding = NULL;
10799 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010800
Benjamin Peterson308d6372009-09-18 21:42:35 +000010801 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10802 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010804 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010805}
10806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010808 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809\n\
10810Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
10813static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010814unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010816 Py_ssize_t i, j, line_pos, src_len, incr;
10817 Py_UCS4 ch;
10818 PyObject *u;
10819 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010821 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010822 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
10824 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
Antoine Pitrou22425222011-10-04 19:10:51 +020010827 if (PyUnicode_READY(self) == -1)
10828 return NULL;
10829
Thomas Wouters7e474022000-07-16 12:04:32 +000010830 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010831 src_len = PyUnicode_GET_LENGTH(self);
10832 i = j = line_pos = 0;
10833 kind = PyUnicode_KIND(self);
10834 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010835 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010836 for (; i < src_len; i++) {
10837 ch = PyUnicode_READ(kind, src_data, i);
10838 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010839 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010840 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010841 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010843 goto overflow;
10844 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010846 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010850 goto overflow;
10851 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010853 if (ch == '\n' || ch == '\r')
10854 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010856 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010857 if (!found)
10858 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010859
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010861 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 if (!u)
10863 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010864 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
Antoine Pitroue71d5742011-10-04 15:55:09 +020010866 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867
Antoine Pitroue71d5742011-10-04 15:55:09 +020010868 for (; i < src_len; i++) {
10869 ch = PyUnicode_READ(kind, src_data, i);
10870 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 incr = tabsize - (line_pos % tabsize);
10873 line_pos += incr;
10874 while (incr--) {
10875 PyUnicode_WRITE(kind, dest_data, j, ' ');
10876 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010877 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010881 line_pos++;
10882 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010883 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 if (ch == '\n' || ch == '\r')
10885 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 }
10888 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010889 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010890
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010892 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10893 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894}
10895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010896PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010897 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898\n\
10899Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010900such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901arguments start and end are interpreted as in slice notation.\n\
10902\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010903Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010908 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010909 Py_ssize_t start;
10910 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
Jesus Ceaac451502011-04-20 17:09:23 +020010913 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10914 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (PyUnicode_READY(self) == -1)
10918 return NULL;
10919 if (PyUnicode_READY(substring) == -1)
10920 return NULL;
10921
Victor Stinner7931d9a2011-11-04 00:22:48 +010010922 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
10924 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (result == -2)
10927 return NULL;
10928
Christian Heimes217cfd12007-12-02 14:31:20 +000010929 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930}
10931
10932static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010933unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010935 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10936 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939}
10940
Guido van Rossumc2504932007-09-18 19:42:40 +000010941/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010942 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010943static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010944unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945{
Guido van Rossumc2504932007-09-18 19:42:40 +000010946 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010947 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 if (_PyUnicode_HASH(self) != -1)
10950 return _PyUnicode_HASH(self);
10951 if (PyUnicode_READY(self) == -1)
10952 return -1;
10953 len = PyUnicode_GET_LENGTH(self);
10954
10955 /* The hash function as a macro, gets expanded three times below. */
10956#define HASH(P) \
10957 x = (Py_uhash_t)*P << 7; \
10958 while (--len >= 0) \
10959 x = (1000003*x) ^ (Py_uhash_t)*P++;
10960
10961 switch (PyUnicode_KIND(self)) {
10962 case PyUnicode_1BYTE_KIND: {
10963 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10964 HASH(c);
10965 break;
10966 }
10967 case PyUnicode_2BYTE_KIND: {
10968 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10969 HASH(s);
10970 break;
10971 }
10972 default: {
10973 Py_UCS4 *l;
10974 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10975 "Impossible switch case in unicode_hash");
10976 l = PyUnicode_4BYTE_DATA(self);
10977 HASH(l);
10978 break;
10979 }
10980 }
10981 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10982
Guido van Rossumc2504932007-09-18 19:42:40 +000010983 if (x == -1)
10984 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010986 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010990PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010993Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994
10995static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010998 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010999 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011000 Py_ssize_t start;
11001 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Jesus Ceaac451502011-04-20 17:09:23 +020011003 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11004 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (PyUnicode_READY(self) == -1)
11008 return NULL;
11009 if (PyUnicode_READY(substring) == -1)
11010 return NULL;
11011
Victor Stinner7931d9a2011-11-04 00:22:48 +010011012 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013
11014 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (result == -2)
11017 return NULL;
11018
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 if (result < 0) {
11020 PyErr_SetString(PyExc_ValueError, "substring not found");
11021 return NULL;
11022 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023
Christian Heimes217cfd12007-12-02 14:31:20 +000011024 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025}
11026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011027PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011030Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011031at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
11033static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011034unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 Py_ssize_t i, length;
11037 int kind;
11038 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 int cased;
11040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (PyUnicode_READY(self) == -1)
11042 return NULL;
11043 length = PyUnicode_GET_LENGTH(self);
11044 kind = PyUnicode_KIND(self);
11045 data = PyUnicode_DATA(self);
11046
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 if (length == 1)
11049 return PyBool_FromLong(
11050 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011052 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011054 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011055
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 for (i = 0; i < length; i++) {
11058 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011059
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11061 return PyBool_FromLong(0);
11062 else if (!cased && Py_UNICODE_ISLOWER(ch))
11063 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011065 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066}
11067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011068PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011069 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011071Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011072at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073
11074static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011075unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 Py_ssize_t i, length;
11078 int kind;
11079 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 int cased;
11081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (PyUnicode_READY(self) == -1)
11083 return NULL;
11084 length = PyUnicode_GET_LENGTH(self);
11085 kind = PyUnicode_KIND(self);
11086 data = PyUnicode_DATA(self);
11087
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 if (length == 1)
11090 return PyBool_FromLong(
11091 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011093 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011096
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 for (i = 0; i < length; i++) {
11099 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011100
Benjamin Peterson29060642009-01-31 22:14:21 +000011101 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11102 return PyBool_FromLong(0);
11103 else if (!cased && Py_UNICODE_ISUPPER(ch))
11104 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011106 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107}
11108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011109PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011112Return True if S is a titlecased string and there is at least one\n\
11113character in S, i.e. upper- and titlecase characters may only\n\
11114follow uncased characters and lowercase characters only cased ones.\n\
11115Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
11117static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011118unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 Py_ssize_t i, length;
11121 int kind;
11122 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 int cased, previous_is_cased;
11124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 if (PyUnicode_READY(self) == -1)
11126 return NULL;
11127 length = PyUnicode_GET_LENGTH(self);
11128 kind = PyUnicode_KIND(self);
11129 data = PyUnicode_DATA(self);
11130
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 if (length == 1) {
11133 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11134 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11135 (Py_UNICODE_ISUPPER(ch) != 0));
11136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011138 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011141
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 cased = 0;
11143 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 for (i = 0; i < length; i++) {
11145 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011146
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11148 if (previous_is_cased)
11149 return PyBool_FromLong(0);
11150 previous_is_cased = 1;
11151 cased = 1;
11152 }
11153 else if (Py_UNICODE_ISLOWER(ch)) {
11154 if (!previous_is_cased)
11155 return PyBool_FromLong(0);
11156 previous_is_cased = 1;
11157 cased = 1;
11158 }
11159 else
11160 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011162 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163}
11164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011165PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011168Return True if all characters in S are whitespace\n\
11169and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170
11171static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011172unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 Py_ssize_t i, length;
11175 int kind;
11176 void *data;
11177
11178 if (PyUnicode_READY(self) == -1)
11179 return NULL;
11180 length = PyUnicode_GET_LENGTH(self);
11181 kind = PyUnicode_KIND(self);
11182 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (length == 1)
11186 return PyBool_FromLong(
11187 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 for (i = 0; i < length; i++) {
11194 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011195 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011196 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011198 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199}
11200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011201PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011203\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011204Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011205and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011206
11207static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011208unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 Py_ssize_t i, length;
11211 int kind;
11212 void *data;
11213
11214 if (PyUnicode_READY(self) == -1)
11215 return NULL;
11216 length = PyUnicode_GET_LENGTH(self);
11217 kind = PyUnicode_KIND(self);
11218 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011219
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011220 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 if (length == 1)
11222 return PyBool_FromLong(
11223 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011224
11225 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 for (i = 0; i < length; i++) {
11230 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011232 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011233 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011234}
11235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011238\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011239Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011241
11242static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011243unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 int kind;
11246 void *data;
11247 Py_ssize_t len, i;
11248
11249 if (PyUnicode_READY(self) == -1)
11250 return NULL;
11251
11252 kind = PyUnicode_KIND(self);
11253 data = PyUnicode_DATA(self);
11254 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011255
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011256 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 if (len == 1) {
11258 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11259 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11260 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011261
11262 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 for (i = 0; i < len; i++) {
11267 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011268 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011270 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011271 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011272}
11273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011277Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
11280static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011281unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 Py_ssize_t i, length;
11284 int kind;
11285 void *data;
11286
11287 if (PyUnicode_READY(self) == -1)
11288 return NULL;
11289 length = PyUnicode_GET_LENGTH(self);
11290 kind = PyUnicode_KIND(self);
11291 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (length == 1)
11295 return PyBool_FromLong(
11296 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011298 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 for (i = 0; i < length; i++) {
11303 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011306 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307}
11308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011312Return True if all characters in S are digits\n\
11313and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011316unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 Py_ssize_t i, length;
11319 int kind;
11320 void *data;
11321
11322 if (PyUnicode_READY(self) == -1)
11323 return NULL;
11324 length = PyUnicode_GET_LENGTH(self);
11325 kind = PyUnicode_KIND(self);
11326 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 if (length == 1) {
11330 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11331 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011334 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 for (i = 0; i < length; i++) {
11339 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011342 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343}
11344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011345PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011348Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
11351static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011352unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 Py_ssize_t i, length;
11355 int kind;
11356 void *data;
11357
11358 if (PyUnicode_READY(self) == -1)
11359 return NULL;
11360 length = PyUnicode_GET_LENGTH(self);
11361 kind = PyUnicode_KIND(self);
11362 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 if (length == 1)
11366 return PyBool_FromLong(
11367 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011369 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 for (i = 0; i < length; i++) {
11374 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011377 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378}
11379
Martin v. Löwis47383402007-08-15 07:32:56 +000011380int
11381PyUnicode_IsIdentifier(PyObject *self)
11382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 int kind;
11384 void *data;
11385 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011386 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (PyUnicode_READY(self) == -1) {
11389 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 }
11392
11393 /* Special case for empty strings */
11394 if (PyUnicode_GET_LENGTH(self) == 0)
11395 return 0;
11396 kind = PyUnicode_KIND(self);
11397 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011398
11399 /* PEP 3131 says that the first character must be in
11400 XID_Start and subsequent characters in XID_Continue,
11401 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011402 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011403 letters, digits, underscore). However, given the current
11404 definition of XID_Start and XID_Continue, it is sufficient
11405 to check just for these, except that _ must be allowed
11406 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011408 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011409 return 0;
11410
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011411 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011414 return 1;
11415}
11416
11417PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011419\n\
11420Return True if S is a valid identifier according\n\
11421to the language definition.");
11422
11423static PyObject*
11424unicode_isidentifier(PyObject *self)
11425{
11426 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11427}
11428
Georg Brandl559e5d72008-06-11 18:37:52 +000011429PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011431\n\
11432Return True if all characters in S are considered\n\
11433printable in repr() or S is empty, False otherwise.");
11434
11435static PyObject*
11436unicode_isprintable(PyObject *self)
11437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 Py_ssize_t i, length;
11439 int kind;
11440 void *data;
11441
11442 if (PyUnicode_READY(self) == -1)
11443 return NULL;
11444 length = PyUnicode_GET_LENGTH(self);
11445 kind = PyUnicode_KIND(self);
11446 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011447
11448 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (length == 1)
11450 return PyBool_FromLong(
11451 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 for (i = 0; i < length; i++) {
11454 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011455 Py_RETURN_FALSE;
11456 }
11457 }
11458 Py_RETURN_TRUE;
11459}
11460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011461PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011462 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463\n\
11464Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011465iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
11467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011468unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011470 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471}
11472
Martin v. Löwis18e16552006-02-15 17:27:45 +000011473static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011474unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (PyUnicode_READY(self) == -1)
11477 return -1;
11478 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479}
11480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011481PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011484Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011485done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
11487static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011488unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011490 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 Py_UCS4 fillchar = ' ';
11492
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011493 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 return NULL;
11495
Victor Stinnerc4b49542011-12-11 22:44:26 +010011496 if (PyUnicode_READY(self) < 0)
11497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Victor Stinnerc4b49542011-12-11 22:44:26 +010011499 if (PyUnicode_GET_LENGTH(self) >= width)
11500 return unicode_result_unchanged(self);
11501
11502 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503}
11504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011505PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011508Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
11510static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011511unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 return fixup(self, fixlower);
11514}
11515
/* Which end(s) of the string a strip operation works on.  The values
   double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the constants above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11524
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011525/* externally visible for str.strip(unicode) */
11526PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011527_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 void *data;
11530 int kind;
11531 Py_ssize_t i, j, len;
11532 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11535 return NULL;
11536
11537 kind = PyUnicode_KIND(self);
11538 data = PyUnicode_DATA(self);
11539 len = PyUnicode_GET_LENGTH(self);
11540 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11541 PyUnicode_DATA(sepobj),
11542 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011543
Benjamin Peterson14339b62009-01-31 16:36:08 +000011544 i = 0;
11545 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 while (i < len &&
11547 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 i++;
11549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011550 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011551
Benjamin Peterson14339b62009-01-31 16:36:08 +000011552 j = len;
11553 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 do {
11555 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 } while (j >= i &&
11557 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011559 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011560
Victor Stinner7931d9a2011-11-04 00:22:48 +010011561 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562}
11563
11564PyObject*
11565PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11566{
11567 unsigned char *data;
11568 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011569 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570
Victor Stinnerde636f32011-10-01 03:55:54 +020011571 if (PyUnicode_READY(self) == -1)
11572 return NULL;
11573
11574 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11575
Victor Stinner12bab6d2011-10-01 01:53:49 +020011576 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011577 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578
Victor Stinner12bab6d2011-10-01 01:53:49 +020011579 length = end - start;
11580 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011581 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582
Victor Stinnerde636f32011-10-01 03:55:54 +020011583 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011584 PyErr_SetString(PyExc_IndexError, "string index out of range");
11585 return NULL;
11586 }
11587
Victor Stinnerb9275c12011-10-05 14:01:42 +020011588 if (PyUnicode_IS_ASCII(self)) {
11589 kind = PyUnicode_KIND(self);
11590 data = PyUnicode_1BYTE_DATA(self);
11591 return unicode_fromascii(data + start, length);
11592 }
11593 else {
11594 kind = PyUnicode_KIND(self);
11595 data = PyUnicode_1BYTE_DATA(self);
11596 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011597 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011598 length);
11599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011603do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 int kind;
11606 void *data;
11607 Py_ssize_t len, i, j;
11608
11609 if (PyUnicode_READY(self) == -1)
11610 return NULL;
11611
11612 kind = PyUnicode_KIND(self);
11613 data = PyUnicode_DATA(self);
11614 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011615
Benjamin Peterson14339b62009-01-31 16:36:08 +000011616 i = 0;
11617 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011619 i++;
11620 }
11621 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011622
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 j = len;
11624 if (striptype != LEFTSTRIP) {
11625 do {
11626 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011628 j++;
11629 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630
Victor Stinner7931d9a2011-11-04 00:22:48 +010011631 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632}
11633
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011634
11635static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011636do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011637{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011638 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639
Benjamin Peterson14339b62009-01-31 16:36:08 +000011640 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11641 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642
Benjamin Peterson14339b62009-01-31 16:36:08 +000011643 if (sep != NULL && sep != Py_None) {
11644 if (PyUnicode_Check(sep))
11645 return _PyUnicode_XStrip(self, striptype, sep);
11646 else {
11647 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "%s arg must be None or str",
11649 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011650 return NULL;
11651 }
11652 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011653
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011655}
11656
11657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011660\n\
11661Return a copy of the string S with leading and trailing\n\
11662whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011663If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664
11665static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011667{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011668 if (PyTuple_GET_SIZE(args) == 0)
11669 return do_strip(self, BOTHSTRIP); /* Common case */
11670 else
11671 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672}
11673
11674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011675PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011677\n\
11678Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011679If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680
11681static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011682unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011684 if (PyTuple_GET_SIZE(args) == 0)
11685 return do_strip(self, LEFTSTRIP); /* Common case */
11686 else
11687 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011688}
11689
11690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011691PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011693\n\
11694Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011695If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
11697static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011698unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 if (PyTuple_GET_SIZE(args) == 0)
11701 return do_strip(self, RIGHTSTRIP); /* Common case */
11702 else
11703 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704}
11705
11706
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011708unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011710 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Georg Brandl222de0f2009-04-12 12:01:50 +000011713 if (len < 1) {
11714 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011715 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
Victor Stinnerc4b49542011-12-11 22:44:26 +010011718 /* no repeat, return original string */
11719 if (len == 1)
11720 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011721
Victor Stinnerc4b49542011-12-11 22:44:26 +010011722 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 return NULL;
11724
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011725 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011726 PyErr_SetString(PyExc_OverflowError,
11727 "repeated string is too long");
11728 return NULL;
11729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011731
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 if (!u)
11734 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011735 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (PyUnicode_GET_LENGTH(str) == 1) {
11738 const int kind = PyUnicode_KIND(str);
11739 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11740 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011741 if (kind == PyUnicode_1BYTE_KIND)
11742 memset(to, (unsigned char)fill_char, len);
11743 else {
11744 for (n = 0; n < len; ++n)
11745 PyUnicode_WRITE(kind, to, n, fill_char);
11746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 }
11748 else {
11749 /* number of characters copied this far */
11750 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011751 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 char *to = (char *) PyUnicode_DATA(u);
11753 Py_MEMCPY(to, PyUnicode_DATA(str),
11754 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 n = (done <= nchars-done) ? done : nchars-done;
11757 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011758 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 }
11761
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011762 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764}
11765
Alexander Belopolsky40018472011-02-26 01:02:56 +000011766PyObject *
11767PyUnicode_Replace(PyObject *obj,
11768 PyObject *subobj,
11769 PyObject *replobj,
11770 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
11772 PyObject *self;
11773 PyObject *str1;
11774 PyObject *str2;
11775 PyObject *result;
11776
11777 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011778 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011781 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 Py_DECREF(self);
11783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 }
11785 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011786 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 Py_DECREF(self);
11788 Py_DECREF(str1);
11789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 Py_DECREF(self);
11793 Py_DECREF(str1);
11794 Py_DECREF(str2);
11795 return result;
11796}
11797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011798PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011799 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800\n\
11801Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011802old replaced by new. If the optional argument count is\n\
11803given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804
11805static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 PyObject *str1;
11809 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011810 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 PyObject *result;
11812
Martin v. Löwis18e16552006-02-15 17:27:45 +000011813 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 str1 = PyUnicode_FromObject(str1);
11818 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11819 return NULL;
11820 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011821 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 Py_DECREF(str1);
11823 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
11826 result = replace(self, str1, str2, maxcount);
11827
11828 Py_DECREF(str1);
11829 Py_DECREF(str2);
11830 return result;
11831}
11832
Alexander Belopolsky40018472011-02-26 01:02:56 +000011833static PyObject *
11834unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011836 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 Py_ssize_t isize;
11838 Py_ssize_t osize, squote, dquote, i, o;
11839 Py_UCS4 max, quote;
11840 int ikind, okind;
11841 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011844 return NULL;
11845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 isize = PyUnicode_GET_LENGTH(unicode);
11847 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 /* Compute length of output, quote characters, and
11850 maximum character */
11851 osize = 2; /* quotes */
11852 max = 127;
11853 squote = dquote = 0;
11854 ikind = PyUnicode_KIND(unicode);
11855 for (i = 0; i < isize; i++) {
11856 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11857 switch (ch) {
11858 case '\'': squote++; osize++; break;
11859 case '"': dquote++; osize++; break;
11860 case '\\': case '\t': case '\r': case '\n':
11861 osize += 2; break;
11862 default:
11863 /* Fast-path ASCII */
11864 if (ch < ' ' || ch == 0x7f)
11865 osize += 4; /* \xHH */
11866 else if (ch < 0x7f)
11867 osize++;
11868 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11869 osize++;
11870 max = ch > max ? ch : max;
11871 }
11872 else if (ch < 0x100)
11873 osize += 4; /* \xHH */
11874 else if (ch < 0x10000)
11875 osize += 6; /* \uHHHH */
11876 else
11877 osize += 10; /* \uHHHHHHHH */
11878 }
11879 }
11880
11881 quote = '\'';
11882 if (squote) {
11883 if (dquote)
11884 /* Both squote and dquote present. Use squote,
11885 and escape them */
11886 osize += squote;
11887 else
11888 quote = '"';
11889 }
11890
11891 repr = PyUnicode_New(osize, max);
11892 if (repr == NULL)
11893 return NULL;
11894 okind = PyUnicode_KIND(repr);
11895 odata = PyUnicode_DATA(repr);
11896
11897 PyUnicode_WRITE(okind, odata, 0, quote);
11898 PyUnicode_WRITE(okind, odata, osize-1, quote);
11899
11900 for (i = 0, o = 1; i < isize; i++) {
11901 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011902
11903 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 if ((ch == quote) || (ch == '\\')) {
11905 PyUnicode_WRITE(okind, odata, o++, '\\');
11906 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011907 continue;
11908 }
11909
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011911 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 PyUnicode_WRITE(okind, odata, o++, '\\');
11913 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011914 }
11915 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 PyUnicode_WRITE(okind, odata, o++, '\\');
11917 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011918 }
11919 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyUnicode_WRITE(okind, odata, o++, '\\');
11921 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011922 }
11923
11924 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011925 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 PyUnicode_WRITE(okind, odata, o++, '\\');
11927 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011930 }
11931
Georg Brandl559e5d72008-06-11 18:37:52 +000011932 /* Copy ASCII characters as-is */
11933 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011935 }
11936
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011938 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011939 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011940 (categories Z* and C* except ASCII space)
11941 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 if (ch <= 0xff) {
11945 PyUnicode_WRITE(okind, odata, o++, '\\');
11946 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011947 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11948 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011949 }
11950 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 else if (ch >= 0x10000) {
11952 PyUnicode_WRITE(okind, odata, o++, '\\');
11953 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011954 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11955 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11956 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11959 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11960 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11961 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011962 }
11963 /* Map 16-bit characters to '\uxxxx' */
11964 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 PyUnicode_WRITE(okind, odata, o++, '\\');
11966 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011967 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11968 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11969 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11970 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011971 }
11972 }
11973 /* Copy characters as-is */
11974 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011976 }
11977 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011980 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011981 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982}
11983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011984PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986\n\
11987Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011988such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989arguments start and end are interpreted as in slice notation.\n\
11990\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011991Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
11993static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011996 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011997 Py_ssize_t start;
11998 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011999 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Jesus Ceaac451502011-04-20 17:09:23 +020012001 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12002 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007 if (PyUnicode_READY(substring) == -1)
12008 return NULL;
12009
Victor Stinner7931d9a2011-11-04 00:22:48 +010012010 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
12012 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (result == -2)
12015 return NULL;
12016
Christian Heimes217cfd12007-12-02 14:31:20 +000012017 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018}
12019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012020PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012023Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
12025static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012028 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012029 Py_ssize_t start;
12030 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012031 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
Jesus Ceaac451502011-04-20 17:09:23 +020012033 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12034 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039 if (PyUnicode_READY(substring) == -1)
12040 return NULL;
12041
Victor Stinner7931d9a2011-11-04 00:22:48 +010012042 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043
12044 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 if (result == -2)
12047 return NULL;
12048
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 if (result < 0) {
12050 PyErr_SetString(PyExc_ValueError, "substring not found");
12051 return NULL;
12052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053
Christian Heimes217cfd12007-12-02 14:31:20 +000012054 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055}
12056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012057PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012060Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012061done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
12063static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012064unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012066 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 Py_UCS4 fillchar = ' ';
12068
Victor Stinnere9a29352011-10-01 02:14:59 +020012069 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012071
Victor Stinnerc4b49542011-12-11 22:44:26 +010012072 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 return NULL;
12074
Victor Stinnerc4b49542011-12-11 22:44:26 +010012075 if (PyUnicode_GET_LENGTH(self) >= width)
12076 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
Victor Stinnerc4b49542011-12-11 22:44:26 +010012078 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079}
12080
Alexander Belopolsky40018472011-02-26 01:02:56 +000012081PyObject *
12082PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
12084 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012085
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 s = PyUnicode_FromObject(s);
12087 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012088 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 if (sep != NULL) {
12090 sep = PyUnicode_FromObject(sep);
12091 if (sep == NULL) {
12092 Py_DECREF(s);
12093 return NULL;
12094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 }
12096
Victor Stinner9310abb2011-10-05 00:59:23 +020012097 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099 Py_DECREF(s);
12100 Py_XDECREF(sep);
12101 return result;
12102}
12103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012104PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106\n\
12107Return a list of the words in S, using sep as the\n\
12108delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012109splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012110whitespace string is a separator and empty strings are\n\
12111removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
12113static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012114unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115{
12116 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012117 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
Martin v. Löwis18e16552006-02-15 17:27:45 +000012119 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 return NULL;
12121
12122 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012123 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012125 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012127 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128}
12129
Thomas Wouters477c8d52006-05-27 19:21:47 +000012130PyObject *
12131PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12132{
12133 PyObject* str_obj;
12134 PyObject* sep_obj;
12135 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 int kind1, kind2, kind;
12137 void *buf1 = NULL, *buf2 = NULL;
12138 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012139
12140 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012141 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012143 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012145 Py_DECREF(str_obj);
12146 return NULL;
12147 }
12148
Victor Stinner14f8f022011-10-05 20:58:25 +020012149 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012151 kind = Py_MAX(kind1, kind2);
12152 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012154 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (!buf1)
12156 goto onError;
12157 buf2 = PyUnicode_DATA(sep_obj);
12158 if (kind2 != kind)
12159 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12160 if (!buf2)
12161 goto onError;
12162 len1 = PyUnicode_GET_LENGTH(str_obj);
12163 len2 = PyUnicode_GET_LENGTH(sep_obj);
12164
Victor Stinner14f8f022011-10-05 20:58:25 +020012165 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012167 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12168 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12169 else
12170 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 break;
12172 case PyUnicode_2BYTE_KIND:
12173 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12174 break;
12175 case PyUnicode_4BYTE_KIND:
12176 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12177 break;
12178 default:
12179 assert(0);
12180 out = 0;
12181 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012182
12183 Py_DECREF(sep_obj);
12184 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (kind1 != kind)
12186 PyMem_Free(buf1);
12187 if (kind2 != kind)
12188 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012189
12190 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 onError:
12192 Py_DECREF(sep_obj);
12193 Py_DECREF(str_obj);
12194 if (kind1 != kind && buf1)
12195 PyMem_Free(buf1);
12196 if (kind2 != kind && buf2)
12197 PyMem_Free(buf2);
12198 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012199}
12200
12201
12202PyObject *
12203PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12204{
12205 PyObject* str_obj;
12206 PyObject* sep_obj;
12207 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 int kind1, kind2, kind;
12209 void *buf1 = NULL, *buf2 = NULL;
12210 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012211
12212 str_obj = PyUnicode_FromObject(str_in);
12213 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012215 sep_obj = PyUnicode_FromObject(sep_in);
12216 if (!sep_obj) {
12217 Py_DECREF(str_obj);
12218 return NULL;
12219 }
12220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 kind1 = PyUnicode_KIND(str_in);
12222 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012223 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 buf1 = PyUnicode_DATA(str_in);
12225 if (kind1 != kind)
12226 buf1 = _PyUnicode_AsKind(str_in, kind);
12227 if (!buf1)
12228 goto onError;
12229 buf2 = PyUnicode_DATA(sep_obj);
12230 if (kind2 != kind)
12231 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12232 if (!buf2)
12233 goto onError;
12234 len1 = PyUnicode_GET_LENGTH(str_obj);
12235 len2 = PyUnicode_GET_LENGTH(sep_obj);
12236
12237 switch(PyUnicode_KIND(str_in)) {
12238 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012239 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12240 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12241 else
12242 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 break;
12244 case PyUnicode_2BYTE_KIND:
12245 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12246 break;
12247 case PyUnicode_4BYTE_KIND:
12248 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12249 break;
12250 default:
12251 assert(0);
12252 out = 0;
12253 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254
12255 Py_DECREF(sep_obj);
12256 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 if (kind1 != kind)
12258 PyMem_Free(buf1);
12259 if (kind2 != kind)
12260 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012261
12262 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 onError:
12264 Py_DECREF(sep_obj);
12265 Py_DECREF(str_obj);
12266 if (kind1 != kind && buf1)
12267 PyMem_Free(buf1);
12268 if (kind2 != kind && buf2)
12269 PyMem_Free(buf2);
12270 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271}
12272
12273PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012275\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012276Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012277the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012278found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012279
12280static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012281unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012282{
Victor Stinner9310abb2011-10-05 00:59:23 +020012283 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012284}
12285
12286PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012287 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012289Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012290the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012291separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012292
12293static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012294unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295{
Victor Stinner9310abb2011-10-05 00:59:23 +020012296 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012297}
12298
Alexander Belopolsky40018472011-02-26 01:02:56 +000012299PyObject *
12300PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012301{
12302 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012304 s = PyUnicode_FromObject(s);
12305 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 if (sep != NULL) {
12308 sep = PyUnicode_FromObject(sep);
12309 if (sep == NULL) {
12310 Py_DECREF(s);
12311 return NULL;
12312 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012313 }
12314
Victor Stinner9310abb2011-10-05 00:59:23 +020012315 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012316
12317 Py_DECREF(s);
12318 Py_XDECREF(sep);
12319 return result;
12320}
12321
12322PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012324\n\
12325Return a list of the words in S, using sep as the\n\
12326delimiter string, starting at the end of the string and\n\
12327working to the front. If maxsplit is given, at most maxsplit\n\
12328splits are done. If sep is not specified, any whitespace string\n\
12329is a separator.");
12330
12331static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012332unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012333{
12334 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012335 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012336
Martin v. Löwis18e16552006-02-15 17:27:45 +000012337 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012338 return NULL;
12339
12340 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012342 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012343 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012344 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012345 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012346}
12347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012348PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350\n\
12351Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012352Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012353is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354
12355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012356unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012358 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012359 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012361 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12362 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363 return NULL;
12364
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012365 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366}
12367
12368static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012369PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012371 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372}
12373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012374PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376\n\
12377Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012378and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379
12380static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012381unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383 return fixup(self, fixswapcase);
12384}
12385
Georg Brandlceee0772007-11-27 23:48:05 +000012386PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012388\n\
12389Return a translation table usable for str.translate().\n\
12390If there is only one argument, it must be a dictionary mapping Unicode\n\
12391ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012392Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012393If there are two arguments, they must be strings of equal length, and\n\
12394in the resulting dictionary, each character in x will be mapped to the\n\
12395character at the same position in y. If there is a third argument, it\n\
12396must be a string, whose characters will be mapped to None in the result.");
12397
12398static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012399unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012400{
12401 PyObject *x, *y = NULL, *z = NULL;
12402 PyObject *new = NULL, *key, *value;
12403 Py_ssize_t i = 0;
12404 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012405
Georg Brandlceee0772007-11-27 23:48:05 +000012406 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12407 return NULL;
12408 new = PyDict_New();
12409 if (!new)
12410 return NULL;
12411 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 int x_kind, y_kind, z_kind;
12413 void *x_data, *y_data, *z_data;
12414
Georg Brandlceee0772007-11-27 23:48:05 +000012415 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012416 if (!PyUnicode_Check(x)) {
12417 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12418 "be a string if there is a second argument");
12419 goto err;
12420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012422 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12423 "arguments must have equal length");
12424 goto err;
12425 }
12426 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 x_kind = PyUnicode_KIND(x);
12428 y_kind = PyUnicode_KIND(y);
12429 x_data = PyUnicode_DATA(x);
12430 y_data = PyUnicode_DATA(y);
12431 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12432 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12433 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012434 if (!key || !value)
12435 goto err;
12436 res = PyDict_SetItem(new, key, value);
12437 Py_DECREF(key);
12438 Py_DECREF(value);
12439 if (res < 0)
12440 goto err;
12441 }
12442 /* create entries for deleting chars in z */
12443 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 z_kind = PyUnicode_KIND(z);
12445 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012446 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012448 if (!key)
12449 goto err;
12450 res = PyDict_SetItem(new, key, Py_None);
12451 Py_DECREF(key);
12452 if (res < 0)
12453 goto err;
12454 }
12455 }
12456 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 int kind;
12458 void *data;
12459
Georg Brandlceee0772007-11-27 23:48:05 +000012460 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012461 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012462 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12463 "to maketrans it must be a dict");
12464 goto err;
12465 }
12466 /* copy entries into the new dict, converting string keys to int keys */
12467 while (PyDict_Next(x, &i, &key, &value)) {
12468 if (PyUnicode_Check(key)) {
12469 /* convert string keys to integer keys */
12470 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012471 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012472 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12473 "table must be of length 1");
12474 goto err;
12475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 kind = PyUnicode_KIND(key);
12477 data = PyUnicode_DATA(key);
12478 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012479 if (!newkey)
12480 goto err;
12481 res = PyDict_SetItem(new, newkey, value);
12482 Py_DECREF(newkey);
12483 if (res < 0)
12484 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012485 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012486 /* just keep integer keys */
12487 if (PyDict_SetItem(new, key, value) < 0)
12488 goto err;
12489 } else {
12490 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12491 "be strings or integers");
12492 goto err;
12493 }
12494 }
12495 }
12496 return new;
12497 err:
12498 Py_DECREF(new);
12499 return NULL;
12500}
12501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012502PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504\n\
12505Return a copy of the string S, where all characters have been mapped\n\
12506through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012507Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012508Unmapped characters are left untouched. Characters mapped to None\n\
12509are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510
12511static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515}
12516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012517PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012520Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
12522static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012523unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525 return fixup(self, fixupper);
12526}
12527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012528PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012531Pad a numeric string S with zeros on the left, to fill a field\n\
12532of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
12534static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012535unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012537 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012538 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012539 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 int kind;
12541 void *data;
12542 Py_UCS4 chr;
12543
Martin v. Löwis18e16552006-02-15 17:27:45 +000012544 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545 return NULL;
12546
Victor Stinnerc4b49542011-12-11 22:44:26 +010012547 if (PyUnicode_READY(self) < 0)
12548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
Victor Stinnerc4b49542011-12-11 22:44:26 +010012550 if (PyUnicode_GET_LENGTH(self) >= width)
12551 return unicode_result_unchanged(self);
12552
12553 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554
12555 u = pad(self, fill, 0, '0');
12556
Walter Dörwald068325e2002-04-15 13:36:47 +000012557 if (u == NULL)
12558 return NULL;
12559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 kind = PyUnicode_KIND(u);
12561 data = PyUnicode_DATA(u);
12562 chr = PyUnicode_READ(kind, data, fill);
12563
12564 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 PyUnicode_WRITE(kind, data, 0, chr);
12567 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568 }
12569
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012570 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012571 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012582PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012585Return True if S starts with the specified prefix, False otherwise.\n\
12586With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012587With optional end, stop comparing S at that position.\n\
12588prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589
12590static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012591unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012594 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012595 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012596 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012597 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012598 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599
Jesus Ceaac451502011-04-20 17:09:23 +020012600 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012602 if (PyTuple_Check(subobj)) {
12603 Py_ssize_t i;
12604 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012605 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012606 if (substring == NULL)
12607 return NULL;
12608 result = tailmatch(self, substring, start, end, -1);
12609 Py_DECREF(substring);
12610 if (result) {
12611 Py_RETURN_TRUE;
12612 }
12613 }
12614 /* nothing matched */
12615 Py_RETURN_FALSE;
12616 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012617 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012618 if (substring == NULL) {
12619 if (PyErr_ExceptionMatches(PyExc_TypeError))
12620 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12621 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012623 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012624 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012626 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627}
12628
12629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012630PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012633Return True if S ends with the specified suffix, False otherwise.\n\
12634With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012635With optional end, stop comparing S at that position.\n\
12636suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
12638static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012639unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012642 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012643 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012644 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012645 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012646 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647
Jesus Ceaac451502011-04-20 17:09:23 +020012648 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012650 if (PyTuple_Check(subobj)) {
12651 Py_ssize_t i;
12652 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012653 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012655 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012657 result = tailmatch(self, substring, start, end, +1);
12658 Py_DECREF(substring);
12659 if (result) {
12660 Py_RETURN_TRUE;
12661 }
12662 }
12663 Py_RETURN_FALSE;
12664 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012665 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012666 if (substring == NULL) {
12667 if (PyErr_ExceptionMatches(PyExc_TypeError))
12668 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12669 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012671 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012672 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012674 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675}
12676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012678
12679PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012681\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012682Return a formatted version of S, using substitutions from args and kwargs.\n\
12683The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012684
Eric Smith27bbca62010-11-04 17:06:58 +000012685PyDoc_STRVAR(format_map__doc__,
12686 "S.format_map(mapping) -> str\n\
12687\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012688Return a formatted version of S, using substitutions from mapping.\n\
12689The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012690
Eric Smith4a7d76d2008-05-30 18:10:19 +000012691static PyObject *
12692unicode__format__(PyObject* self, PyObject* args)
12693{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012694 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012695
12696 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12697 return NULL;
12698
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012699 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012701 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012702}
12703
Eric Smith8c663262007-08-25 02:26:07 +000012704PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012706\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012707Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012708
12709static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012710unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 Py_ssize_t size;
12713
12714 /* If it's a compact object, account for base structure +
12715 character data. */
12716 if (PyUnicode_IS_COMPACT_ASCII(v))
12717 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12718 else if (PyUnicode_IS_COMPACT(v))
12719 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012720 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 else {
12722 /* If it is a two-block object, account for base object, and
12723 for character block if present. */
12724 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012725 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012727 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 }
12729 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012730 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012731 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012733 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012734 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735
12736 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012737}
12738
12739PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012741
12742static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012743unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012744{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012745 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 if (!copy)
12747 return NULL;
12748 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012749}
12750
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751static PyMethodDef unicode_methods[] = {
12752
12753 /* Order is according to common usage: often used methods should
12754 appear first, since lookup is done sequentially. */
12755
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012756 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012757 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12758 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012759 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012760 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12761 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12762 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12763 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12764 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12765 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12766 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012768 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12769 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12770 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012771 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012772 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12773 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12774 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012775 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012776 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012777 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012778 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012779 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12780 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12781 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12782 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12783 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12784 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12785 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12786 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12787 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12788 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12789 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12790 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12791 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12792 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012793 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012794 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012795 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012796 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012797 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012798 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012799 {"maketrans", (PyCFunction) unicode_maketrans,
12800 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012801 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012802#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012803 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804#endif
12805
12806#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012807 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012808 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809#endif
12810
Benjamin Peterson14339b62009-01-31 16:36:08 +000012811 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812 {NULL, NULL}
12813};
12814
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012815static PyObject *
12816unicode_mod(PyObject *v, PyObject *w)
12817{
Brian Curtindfc80e32011-08-10 20:28:54 -050012818 if (!PyUnicode_Check(v))
12819 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012821}
12822
12823static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824 0, /*nb_add*/
12825 0, /*nb_subtract*/
12826 0, /*nb_multiply*/
12827 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012828};
12829
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012831 (lenfunc) unicode_length, /* sq_length */
12832 PyUnicode_Concat, /* sq_concat */
12833 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12834 (ssizeargfunc) unicode_getitem, /* sq_item */
12835 0, /* sq_slice */
12836 0, /* sq_ass_item */
12837 0, /* sq_ass_slice */
12838 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839};
12840
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012841static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012842unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012843{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 if (PyUnicode_READY(self) == -1)
12845 return NULL;
12846
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012847 if (PyIndex_Check(item)) {
12848 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012849 if (i == -1 && PyErr_Occurred())
12850 return NULL;
12851 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012853 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012854 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012855 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012856 PyObject *result;
12857 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012858 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012859 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012863 return NULL;
12864 }
12865
12866 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012867 Py_INCREF(unicode_empty);
12868 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010012870 slicelength == PyUnicode_GET_LENGTH(self)) {
12871 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000012872 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012873 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012874 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012875 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012876 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012877 src_kind = PyUnicode_KIND(self);
12878 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012879 if (!PyUnicode_IS_ASCII(self)) {
12880 kind_limit = kind_maxchar_limit(src_kind);
12881 max_char = 0;
12882 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12883 ch = PyUnicode_READ(src_kind, src_data, cur);
12884 if (ch > max_char) {
12885 max_char = ch;
12886 if (max_char >= kind_limit)
12887 break;
12888 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012889 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012890 }
Victor Stinner55c99112011-10-13 01:17:06 +020012891 else
12892 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012893 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012894 if (result == NULL)
12895 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012896 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012897 dest_data = PyUnicode_DATA(result);
12898
12899 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012900 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12901 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012902 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012903 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012904 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012905 } else {
12906 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12907 return NULL;
12908 }
12909}
12910
12911static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012912 (lenfunc)unicode_length, /* mp_length */
12913 (binaryfunc)unicode_subscript, /* mp_subscript */
12914 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012915};
12916
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918/* Helpers for PyUnicode_Format() */
12919
12920static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012921getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012923 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 (*p_argidx)++;
12926 if (arglen < 0)
12927 return args;
12928 else
12929 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930 }
12931 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933 return NULL;
12934}
12935
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012936/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012938static PyObject *
12939formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012941 char *p;
12942 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012944
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945 x = PyFloat_AsDouble(v);
12946 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012947 return NULL;
12948
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012951
Eric Smith0923d1d2009-04-16 20:16:10 +000012952 p = PyOS_double_to_string(x, type, prec,
12953 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012954 if (p == NULL)
12955 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012957 PyMem_Free(p);
12958 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959}
12960
Tim Peters38fd5b62000-09-21 05:43:11 +000012961static PyObject*
12962formatlong(PyObject *val, int flags, int prec, int type)
12963{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012964 char *buf;
12965 int len;
12966 PyObject *str; /* temporary string object. */
12967 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012968
Benjamin Peterson14339b62009-01-31 16:36:08 +000012969 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12970 if (!str)
12971 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 Py_DECREF(str);
12974 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012975}
12976
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012977static Py_UCS4
12978formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012980 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012981 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012983 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 goto onError;
12986 }
12987 else {
12988 /* Integer input truncated to a character */
12989 long x;
12990 x = PyLong_AsLong(v);
12991 if (x == -1 && PyErr_Occurred())
12992 goto onError;
12993
Victor Stinner8faf8212011-12-08 22:14:11 +010012994 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 PyErr_SetString(PyExc_OverflowError,
12996 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012997 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 }
12999
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013000 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013001 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013002
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013004 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013006 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007}
13008
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013009static int
13010repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13011{
13012 int r;
13013 assert(count > 0);
13014 assert(PyUnicode_Check(obj));
13015 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013016 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013017 if (repeated == NULL)
13018 return -1;
13019 r = _PyAccu_Accumulate(acc, repeated);
13020 Py_DECREF(repeated);
13021 return r;
13022 }
13023 else {
13024 do {
13025 if (_PyAccu_Accumulate(acc, obj))
13026 return -1;
13027 } while (--count);
13028 return 0;
13029 }
13030}
13031
Alexander Belopolsky40018472011-02-26 01:02:56 +000013032PyObject *
13033PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 void *fmt;
13036 int fmtkind;
13037 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013039 int r;
13040 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013043 PyObject *temp = NULL;
13044 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013045 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013046 _PyAccu acc;
13047 static PyObject *plus, *minus, *blank, *zero, *percent;
13048
13049 if (!plus && !(plus = get_latin1_char('+')))
13050 return NULL;
13051 if (!minus && !(minus = get_latin1_char('-')))
13052 return NULL;
13053 if (!blank && !(blank = get_latin1_char(' ')))
13054 return NULL;
13055 if (!zero && !(zero = get_latin1_char('0')))
13056 return NULL;
13057 if (!percent && !(percent = get_latin1_char('%')))
13058 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013059
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 PyErr_BadInternalCall();
13062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013064 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013067 if (_PyAccu_Init(&acc))
13068 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 fmt = PyUnicode_DATA(uformat);
13070 fmtkind = PyUnicode_KIND(uformat);
13071 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13072 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 arglen = PyTuple_Size(args);
13076 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077 }
13078 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 arglen = -1;
13080 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013082 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013083 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013084 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
13086 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013088 PyObject *nonfmt;
13089 Py_ssize_t nonfmtpos;
13090 nonfmtpos = fmtpos++;
13091 while (fmtcnt >= 0 &&
13092 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13093 fmtpos++;
13094 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013095 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013096 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013097 if (nonfmt == NULL)
13098 goto onError;
13099 r = _PyAccu_Accumulate(&acc, nonfmt);
13100 Py_DECREF(nonfmt);
13101 if (r)
13102 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013103 }
13104 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 /* Got a format specifier */
13106 int flags = 0;
13107 Py_ssize_t width = -1;
13108 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013110 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 int isnumok;
13112 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013113 void *pbuf = NULL;
13114 Py_ssize_t pindex, len;
13115 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 fmtpos++;
13118 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13119 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 Py_ssize_t keylen;
13121 PyObject *key;
13122 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013123
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 if (dict == NULL) {
13125 PyErr_SetString(PyExc_TypeError,
13126 "format requires a mapping");
13127 goto onError;
13128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013130 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 /* Skip over balanced parentheses */
13133 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 if (fmtcnt < 0 || pcount > 0) {
13142 PyErr_SetString(PyExc_ValueError,
13143 "incomplete format key");
13144 goto onError;
13145 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013146 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013147 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 if (key == NULL)
13149 goto onError;
13150 if (args_owned) {
13151 Py_DECREF(args);
13152 args_owned = 0;
13153 }
13154 args = PyObject_GetItem(dict, key);
13155 Py_DECREF(key);
13156 if (args == NULL) {
13157 goto onError;
13158 }
13159 args_owned = 1;
13160 arglen = -1;
13161 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013162 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 case '-': flags |= F_LJUST; continue;
13166 case '+': flags |= F_SIGN; continue;
13167 case ' ': flags |= F_BLANK; continue;
13168 case '#': flags |= F_ALT; continue;
13169 case '0': flags |= F_ZERO; continue;
13170 }
13171 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 if (c == '*') {
13174 v = getnextarg(args, arglen, &argidx);
13175 if (v == NULL)
13176 goto onError;
13177 if (!PyLong_Check(v)) {
13178 PyErr_SetString(PyExc_TypeError,
13179 "* wants int");
13180 goto onError;
13181 }
13182 width = PyLong_AsLong(v);
13183 if (width == -1 && PyErr_Occurred())
13184 goto onError;
13185 if (width < 0) {
13186 flags |= F_LJUST;
13187 width = -width;
13188 }
13189 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 }
13192 else if (c >= '0' && c <= '9') {
13193 width = c - '0';
13194 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 if (c < '0' || c > '9')
13197 break;
13198 if ((width*10) / 10 != width) {
13199 PyErr_SetString(PyExc_ValueError,
13200 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 }
13203 width = width*10 + (c - '0');
13204 }
13205 }
13206 if (c == '.') {
13207 prec = 0;
13208 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 if (c == '*') {
13211 v = getnextarg(args, arglen, &argidx);
13212 if (v == NULL)
13213 goto onError;
13214 if (!PyLong_Check(v)) {
13215 PyErr_SetString(PyExc_TypeError,
13216 "* wants int");
13217 goto onError;
13218 }
13219 prec = PyLong_AsLong(v);
13220 if (prec == -1 && PyErr_Occurred())
13221 goto onError;
13222 if (prec < 0)
13223 prec = 0;
13224 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013226 }
13227 else if (c >= '0' && c <= '9') {
13228 prec = c - '0';
13229 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 if (c < '0' || c > '9')
13232 break;
13233 if ((prec*10) / 10 != prec) {
13234 PyErr_SetString(PyExc_ValueError,
13235 "prec too big");
13236 goto onError;
13237 }
13238 prec = prec*10 + (c - '0');
13239 }
13240 }
13241 } /* prec */
13242 if (fmtcnt >= 0) {
13243 if (c == 'h' || c == 'l' || c == 'L') {
13244 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 }
13247 }
13248 if (fmtcnt < 0) {
13249 PyErr_SetString(PyExc_ValueError,
13250 "incomplete format");
13251 goto onError;
13252 }
13253 if (c != '%') {
13254 v = getnextarg(args, arglen, &argidx);
13255 if (v == NULL)
13256 goto onError;
13257 }
13258 sign = 0;
13259 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013260 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 switch (c) {
13262
13263 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013264 _PyAccu_Accumulate(&acc, percent);
13265 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013266
13267 case 's':
13268 case 'r':
13269 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013270 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 temp = v;
13272 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 }
13274 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013275 if (c == 's')
13276 temp = PyObject_Str(v);
13277 else if (c == 'r')
13278 temp = PyObject_Repr(v);
13279 else
13280 temp = PyObject_ASCII(v);
13281 if (temp == NULL)
13282 goto onError;
13283 if (PyUnicode_Check(temp))
13284 /* nothing to do */;
13285 else {
13286 Py_DECREF(temp);
13287 PyErr_SetString(PyExc_TypeError,
13288 "%s argument has non-string str()");
13289 goto onError;
13290 }
13291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 if (PyUnicode_READY(temp) == -1) {
13293 Py_CLEAR(temp);
13294 goto onError;
13295 }
13296 pbuf = PyUnicode_DATA(temp);
13297 kind = PyUnicode_KIND(temp);
13298 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 if (prec >= 0 && len > prec)
13300 len = prec;
13301 break;
13302
13303 case 'i':
13304 case 'd':
13305 case 'u':
13306 case 'o':
13307 case 'x':
13308 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 isnumok = 0;
13310 if (PyNumber_Check(v)) {
13311 PyObject *iobj=NULL;
13312
13313 if (PyLong_Check(v)) {
13314 iobj = v;
13315 Py_INCREF(iobj);
13316 }
13317 else {
13318 iobj = PyNumber_Long(v);
13319 }
13320 if (iobj!=NULL) {
13321 if (PyLong_Check(iobj)) {
13322 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013323 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013324 Py_DECREF(iobj);
13325 if (!temp)
13326 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 if (PyUnicode_READY(temp) == -1) {
13328 Py_CLEAR(temp);
13329 goto onError;
13330 }
13331 pbuf = PyUnicode_DATA(temp);
13332 kind = PyUnicode_KIND(temp);
13333 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 sign = 1;
13335 }
13336 else {
13337 Py_DECREF(iobj);
13338 }
13339 }
13340 }
13341 if (!isnumok) {
13342 PyErr_Format(PyExc_TypeError,
13343 "%%%c format: a number is required, "
13344 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13345 goto onError;
13346 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013347 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013349 fillobj = zero;
13350 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 break;
13352
13353 case 'e':
13354 case 'E':
13355 case 'f':
13356 case 'F':
13357 case 'g':
13358 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013359 temp = formatfloat(v, flags, prec, c);
13360 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362 if (PyUnicode_READY(temp) == -1) {
13363 Py_CLEAR(temp);
13364 goto onError;
13365 }
13366 pbuf = PyUnicode_DATA(temp);
13367 kind = PyUnicode_KIND(temp);
13368 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013369 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013370 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013372 fillobj = zero;
13373 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 break;
13375
13376 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013377 {
13378 Py_UCS4 ch = formatchar(v);
13379 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013381 temp = _PyUnicode_FromUCS4(&ch, 1);
13382 if (temp == NULL)
13383 goto onError;
13384 pbuf = PyUnicode_DATA(temp);
13385 kind = PyUnicode_KIND(temp);
13386 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013388 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013389
13390 default:
13391 PyErr_Format(PyExc_ValueError,
13392 "unsupported format character '%c' (0x%x) "
13393 "at index %zd",
13394 (31<=c && c<=126) ? (char)c : '?',
13395 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 goto onError;
13398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013399 /* pbuf is initialized here. */
13400 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013402 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13403 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013405 pindex++;
13406 }
13407 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13408 signobj = plus;
13409 len--;
13410 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 }
13412 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013415 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 else
13417 sign = 0;
13418 }
13419 if (width < len)
13420 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013422 if (fill != ' ') {
13423 assert(signobj != NULL);
13424 if (_PyAccu_Accumulate(&acc, signobj))
13425 goto onError;
13426 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 if (width > len)
13428 width--;
13429 }
13430 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013431 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013432 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013434 second = get_latin1_char(
13435 PyUnicode_READ(kind, pbuf, pindex + 1));
13436 pindex += 2;
13437 if (second == NULL ||
13438 _PyAccu_Accumulate(&acc, zero) ||
13439 _PyAccu_Accumulate(&acc, second))
13440 goto onError;
13441 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 width -= 2;
13444 if (width < 0)
13445 width = 0;
13446 len -= 2;
13447 }
13448 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013449 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013450 if (repeat_accumulate(&acc, fillobj, width - len))
13451 goto onError;
13452 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 }
13454 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013455 if (sign) {
13456 assert(signobj != NULL);
13457 if (_PyAccu_Accumulate(&acc, signobj))
13458 goto onError;
13459 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013461 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13462 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013463 second = get_latin1_char(
13464 PyUnicode_READ(kind, pbuf, pindex + 1));
13465 pindex += 2;
13466 if (second == NULL ||
13467 _PyAccu_Accumulate(&acc, zero) ||
13468 _PyAccu_Accumulate(&acc, second))
13469 goto onError;
13470 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013471 }
13472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 if (temp != NULL) {
13475 assert(pbuf == PyUnicode_DATA(temp));
13476 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013478 else {
13479 const char *p = (const char *) pbuf;
13480 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013481 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013482 v = PyUnicode_FromKindAndData(kind, p, len);
13483 }
13484 if (v == NULL)
13485 goto onError;
13486 r = _PyAccu_Accumulate(&acc, v);
13487 Py_DECREF(v);
13488 if (r)
13489 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013490 if (width > len && repeat_accumulate(&acc, blank, width - len))
13491 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 if (dict && (argidx < arglen) && c != '%') {
13493 PyErr_SetString(PyExc_TypeError,
13494 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 goto onError;
13496 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013497 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013499 } /* until end */
13500 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 PyErr_SetString(PyExc_TypeError,
13502 "not all arguments converted during string formatting");
13503 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504 }
13505
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013506 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013508 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013509 }
13510 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013511 Py_XDECREF(temp);
13512 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013513 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013514
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013516 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 Py_XDECREF(temp);
13518 Py_XDECREF(second);
13519 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013520 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013522 }
13523 return NULL;
13524}
13525
Jeremy Hylton938ace62002-07-17 16:30:39 +000013526static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013527unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13528
Tim Peters6d6c1a32001-08-02 04:15:00 +000013529static PyObject *
13530unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13531{
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013533 static char *kwlist[] = {"object", "encoding", "errors", 0};
13534 char *encoding = NULL;
13535 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013536
Benjamin Peterson14339b62009-01-31 16:36:08 +000013537 if (type != &PyUnicode_Type)
13538 return unicode_subtype_new(type, args, kwds);
13539 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013541 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013542 if (x == NULL) {
13543 Py_INCREF(unicode_empty);
13544 return unicode_empty;
13545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013546 if (encoding == NULL && errors == NULL)
13547 return PyObject_Str(x);
13548 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013549 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013550}
13551
Guido van Rossume023fe02001-08-30 03:12:59 +000013552static PyObject *
13553unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13554{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013555 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013556 Py_ssize_t length, char_size;
13557 int share_wstr, share_utf8;
13558 unsigned int kind;
13559 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013560
Benjamin Peterson14339b62009-01-31 16:36:08 +000013561 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013562
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013563 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013564 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013565 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013566 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013567 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013568 return NULL;
13569
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013570 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013571 if (self == NULL) {
13572 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013573 return NULL;
13574 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013575 kind = PyUnicode_KIND(unicode);
13576 length = PyUnicode_GET_LENGTH(unicode);
13577
13578 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013579#ifdef Py_DEBUG
13580 _PyUnicode_HASH(self) = -1;
13581#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013582 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013583#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013584 _PyUnicode_STATE(self).interned = 0;
13585 _PyUnicode_STATE(self).kind = kind;
13586 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013587 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013588 _PyUnicode_STATE(self).ready = 1;
13589 _PyUnicode_WSTR(self) = NULL;
13590 _PyUnicode_UTF8_LENGTH(self) = 0;
13591 _PyUnicode_UTF8(self) = NULL;
13592 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013593 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013594
13595 share_utf8 = 0;
13596 share_wstr = 0;
13597 if (kind == PyUnicode_1BYTE_KIND) {
13598 char_size = 1;
13599 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13600 share_utf8 = 1;
13601 }
13602 else if (kind == PyUnicode_2BYTE_KIND) {
13603 char_size = 2;
13604 if (sizeof(wchar_t) == 2)
13605 share_wstr = 1;
13606 }
13607 else {
13608 assert(kind == PyUnicode_4BYTE_KIND);
13609 char_size = 4;
13610 if (sizeof(wchar_t) == 4)
13611 share_wstr = 1;
13612 }
13613
13614 /* Ensure we won't overflow the length. */
13615 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13616 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013618 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013619 data = PyObject_MALLOC((length + 1) * char_size);
13620 if (data == NULL) {
13621 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 goto onError;
13623 }
13624
Victor Stinnerc3c74152011-10-02 20:39:55 +020013625 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013626 if (share_utf8) {
13627 _PyUnicode_UTF8_LENGTH(self) = length;
13628 _PyUnicode_UTF8(self) = data;
13629 }
13630 if (share_wstr) {
13631 _PyUnicode_WSTR_LENGTH(self) = length;
13632 _PyUnicode_WSTR(self) = (wchar_t *)data;
13633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013635 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013636 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013637 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013638#ifdef Py_DEBUG
13639 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13640#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013641 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013642 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013643
13644onError:
13645 Py_DECREF(unicode);
13646 Py_DECREF(self);
13647 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013648}
13649
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013656
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013657static PyObject *unicode_iter(PyObject *seq);
13658
/* Type object for str.  Slots are listed positionally; the trailing comment
   on each line names the slot being filled.  A value of 0 leaves the slot
   unset in this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13702
13703/* Initialize the Unicode implementation */
13704
Victor Stinner3a50e702011-10-18 21:21:00 +020013705int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013707 int i;
13708
Thomas Wouters477c8d52006-05-27 19:21:47 +000013709 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013710 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013711 0x000A, /* LINE FEED */
13712 0x000D, /* CARRIAGE RETURN */
13713 0x001C, /* FILE SEPARATOR */
13714 0x001D, /* GROUP SEPARATOR */
13715 0x001E, /* RECORD SEPARATOR */
13716 0x0085, /* NEXT LINE */
13717 0x2028, /* LINE SEPARATOR */
13718 0x2029, /* PARAGRAPH SEPARATOR */
13719 };
13720
Fred Drakee4315f52000-05-09 19:53:39 +000013721 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013722 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013723 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013724 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013725 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013726
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013727 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013729 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013731
13732 /* initialize the linebreak bloom filter */
13733 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013734 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013735 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013736
13737 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013738
13739#ifdef HAVE_MBCS
13740 winver.dwOSVersionInfoSize = sizeof(winver);
13741 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13742 PyErr_SetFromWindowsErr(0);
13743 return -1;
13744 }
13745#endif
13746 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747}
13748
13749/* Finalize the Unicode implementation */
13750
/* This implementation keeps no free list of unicode objects, so there is
   nothing to release; always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
13756
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757void
Thomas Wouters78890102000-07-22 19:25:51 +000013758_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013759{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013760 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013762 Py_XDECREF(unicode_empty);
13763 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013764
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013765 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 if (unicode_latin1[i]) {
13767 Py_DECREF(unicode_latin1[i]);
13768 unicode_latin1[i] = NULL;
13769 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013770 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013771 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013772 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013773}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013774
Walter Dörwald16807132007-05-25 13:52:07 +000013775void
13776PyUnicode_InternInPlace(PyObject **p)
13777{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013778 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013779 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013780#ifdef Py_DEBUG
13781 assert(s != NULL);
13782 assert(_PyUnicode_CHECK(s));
13783#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013784 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013785 return;
13786#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013787 /* If it's a subclass, we don't really know what putting
13788 it in the interned dict might do. */
13789 if (!PyUnicode_CheckExact(s))
13790 return;
13791 if (PyUnicode_CHECK_INTERNED(s))
13792 return;
13793 if (interned == NULL) {
13794 interned = PyDict_New();
13795 if (interned == NULL) {
13796 PyErr_Clear(); /* Don't leave an exception */
13797 return;
13798 }
13799 }
13800 /* It might be that the GetItem call fails even
13801 though the key is present in the dictionary,
13802 namely when this happens during a stack overflow. */
13803 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013804 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013805 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013806
Benjamin Peterson29060642009-01-31 22:14:21 +000013807 if (t) {
13808 Py_INCREF(t);
13809 Py_DECREF(*p);
13810 *p = t;
13811 return;
13812 }
Walter Dörwald16807132007-05-25 13:52:07 +000013813
Benjamin Peterson14339b62009-01-31 16:36:08 +000013814 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013815 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013816 PyErr_Clear();
13817 PyThreadState_GET()->recursion_critical = 0;
13818 return;
13819 }
13820 PyThreadState_GET()->recursion_critical = 0;
13821 /* The two references in interned are not counted by refcnt.
13822 The deallocator will take care of this */
13823 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013824 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013825}
13826
13827void
13828PyUnicode_InternImmortal(PyObject **p)
13829{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013830 PyUnicode_InternInPlace(p);
13831 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013832 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013833 Py_INCREF(*p);
13834 }
Walter Dörwald16807132007-05-25 13:52:07 +000013835}
13836
13837PyObject *
13838PyUnicode_InternFromString(const char *cp)
13839{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013840 PyObject *s = PyUnicode_FromString(cp);
13841 if (s == NULL)
13842 return NULL;
13843 PyUnicode_InternInPlace(&s);
13844 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013845}
13846
Alexander Belopolsky40018472011-02-26 01:02:56 +000013847void
13848_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013849{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013850 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013851 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013852 Py_ssize_t i, n;
13853 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013854
Benjamin Peterson14339b62009-01-31 16:36:08 +000013855 if (interned == NULL || !PyDict_Check(interned))
13856 return;
13857 keys = PyDict_Keys(interned);
13858 if (keys == NULL || !PyList_Check(keys)) {
13859 PyErr_Clear();
13860 return;
13861 }
Walter Dörwald16807132007-05-25 13:52:07 +000013862
Benjamin Peterson14339b62009-01-31 16:36:08 +000013863 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13864 detector, interned unicode strings are not forcibly deallocated;
13865 rather, we give them their stolen references back, and then clear
13866 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013867
Benjamin Peterson14339b62009-01-31 16:36:08 +000013868 n = PyList_GET_SIZE(keys);
13869 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013870 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013872 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013873 if (PyUnicode_READY(s) == -1) {
13874 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013878 case SSTATE_NOT_INTERNED:
13879 /* XXX Shouldn't happen */
13880 break;
13881 case SSTATE_INTERNED_IMMORTAL:
13882 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013883 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013884 break;
13885 case SSTATE_INTERNED_MORTAL:
13886 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013887 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013888 break;
13889 default:
13890 Py_FatalError("Inconsistent interned string state.");
13891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013893 }
13894 fprintf(stderr, "total size of all interned strings: "
13895 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13896 "mortal/immortal\n", mortal_size, immortal_size);
13897 Py_DECREF(keys);
13898 PyDict_Clear(interned);
13899 Py_DECREF(interned);
13900 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013901}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013902
13903
13904/********************* Unicode Iterator **************************/
13905
13906typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013907 PyObject_HEAD
13908 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013909 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013910} unicodeiterobject;
13911
13912static void
13913unicodeiter_dealloc(unicodeiterobject *it)
13914{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013915 _PyObject_GC_UNTRACK(it);
13916 Py_XDECREF(it->it_seq);
13917 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013918}
13919
13920static int
13921unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13922{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013923 Py_VISIT(it->it_seq);
13924 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013925}
13926
13927static PyObject *
13928unicodeiter_next(unicodeiterobject *it)
13929{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013930 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013931
Benjamin Peterson14339b62009-01-31 16:36:08 +000013932 assert(it != NULL);
13933 seq = it->it_seq;
13934 if (seq == NULL)
13935 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013936 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013938 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13939 int kind = PyUnicode_KIND(seq);
13940 void *data = PyUnicode_DATA(seq);
13941 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13942 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013943 if (item != NULL)
13944 ++it->it_index;
13945 return item;
13946 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013947
Benjamin Peterson14339b62009-01-31 16:36:08 +000013948 Py_DECREF(seq);
13949 it->it_seq = NULL;
13950 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013951}
13952
13953static PyObject *
13954unicodeiter_len(unicodeiterobject *it)
13955{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013956 Py_ssize_t len = 0;
13957 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013958 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013960}
13961
13962PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13963
13964static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013966 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013967 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013968};
13969
13970PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13972 "str_iterator", /* tp_name */
13973 sizeof(unicodeiterobject), /* tp_basicsize */
13974 0, /* tp_itemsize */
13975 /* methods */
13976 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13977 0, /* tp_print */
13978 0, /* tp_getattr */
13979 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013980 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013981 0, /* tp_repr */
13982 0, /* tp_as_number */
13983 0, /* tp_as_sequence */
13984 0, /* tp_as_mapping */
13985 0, /* tp_hash */
13986 0, /* tp_call */
13987 0, /* tp_str */
13988 PyObject_GenericGetAttr, /* tp_getattro */
13989 0, /* tp_setattro */
13990 0, /* tp_as_buffer */
13991 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13992 0, /* tp_doc */
13993 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13994 0, /* tp_clear */
13995 0, /* tp_richcompare */
13996 0, /* tp_weaklistoffset */
13997 PyObject_SelfIter, /* tp_iter */
13998 (iternextfunc)unicodeiter_next, /* tp_iternext */
13999 unicodeiter_methods, /* tp_methods */
14000 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014001};
14002
14003static PyObject *
14004unicode_iter(PyObject *seq)
14005{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014007
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 if (!PyUnicode_Check(seq)) {
14009 PyErr_BadInternalCall();
14010 return NULL;
14011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014012 if (PyUnicode_READY(seq) == -1)
14013 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14015 if (it == NULL)
14016 return NULL;
14017 it->it_index = 0;
14018 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014019 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 _PyObject_GC_TRACK(it);
14021 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014022}
14023
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014024
14025size_t
14026Py_UNICODE_strlen(const Py_UNICODE *u)
14027{
14028 int res = 0;
14029 while(*u++)
14030 res++;
14031 return res;
14032}
14033
14034Py_UNICODE*
14035Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14036{
14037 Py_UNICODE *u = s1;
14038 while ((*u++ = *s2++));
14039 return s1;
14040}
14041
14042Py_UNICODE*
14043Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14044{
14045 Py_UNICODE *u = s1;
14046 while ((*u++ = *s2++))
14047 if (n-- == 0)
14048 break;
14049 return s1;
14050}
14051
14052Py_UNICODE*
14053Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14054{
14055 Py_UNICODE *u1 = s1;
14056 u1 += Py_UNICODE_strlen(u1);
14057 Py_UNICODE_strcpy(u1, s2);
14058 return s1;
14059}
14060
14061int
14062Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14063{
14064 while (*s1 && *s2 && *s1 == *s2)
14065 s1++, s2++;
14066 if (*s1 && *s2)
14067 return (*s1 < *s2) ? -1 : +1;
14068 if (*s1)
14069 return 1;
14070 if (*s2)
14071 return -1;
14072 return 0;
14073}
14074
14075int
14076Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14077{
14078 register Py_UNICODE u1, u2;
14079 for (; n != 0; n--) {
14080 u1 = *s1;
14081 u2 = *s2;
14082 if (u1 != u2)
14083 return (u1 < u2) ? -1 : +1;
14084 if (u1 == '\0')
14085 return 0;
14086 s1++;
14087 s2++;
14088 }
14089 return 0;
14090}
14091
14092Py_UNICODE*
14093Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14094{
14095 const Py_UNICODE *p;
14096 for (p = s; *p; p++)
14097 if (*p == c)
14098 return (Py_UNICODE*)p;
14099 return NULL;
14100}
14101
14102Py_UNICODE*
14103Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14104{
14105 const Py_UNICODE *p;
14106 p = s + Py_UNICODE_strlen(s);
14107 while (p != s) {
14108 p--;
14109 if (*p == c)
14110 return (Py_UNICODE*)p;
14111 }
14112 return NULL;
14113}
Victor Stinner331ea922010-08-10 16:37:20 +000014114
Victor Stinner71133ff2010-09-01 23:43:53 +000014115Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014116PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014117{
Victor Stinner577db2c2011-10-11 22:12:48 +020014118 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014119 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014121 if (!PyUnicode_Check(unicode)) {
14122 PyErr_BadArgument();
14123 return NULL;
14124 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014125 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014126 if (u == NULL)
14127 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014128 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014129 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014130 PyErr_NoMemory();
14131 return NULL;
14132 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014133 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014134 size *= sizeof(Py_UNICODE);
14135 copy = PyMem_Malloc(size);
14136 if (copy == NULL) {
14137 PyErr_NoMemory();
14138 return NULL;
14139 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014140 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014141 return copy;
14142}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014143
Georg Brandl66c221e2010-10-14 07:04:07 +000014144/* A _string module, to export formatter_parser and formatter_field_name_split
14145 to the string.Formatter class implemented in Python. */
14146
14147static PyMethodDef _string_methods[] = {
14148 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14149 METH_O, PyDoc_STR("split the argument as a field name")},
14150 {"formatter_parser", (PyCFunction) formatter_parser,
14151 METH_O, PyDoc_STR("parse the argument as a format string")},
14152 {NULL, NULL}
14153};
14154
14155static struct PyModuleDef _string_module = {
14156 PyModuleDef_HEAD_INIT,
14157 "_string",
14158 PyDoc_STR("string helper module"),
14159 0,
14160 _string_methods,
14161 NULL,
14162 NULL,
14163 NULL,
14164 NULL
14165};
14166
14167PyMODINIT_FUNC
14168PyInit__string(void)
14169{
14170 return PyModule_Create(&_string_module);
14171}
14172
14173
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014174#ifdef __cplusplus
14175}
14176#endif