blob: c974ffe5ded716ad6494a257c5a17cf2a49d0a6b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches.  This block defines BYTEORDER_IS_BIG_ENDIAN when
   WORDS_BIGENDIAN is set and BYTEORDER_IS_LITTLE_ENDIAN otherwise, so
   little endian is the default. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point defined by Unicode 6.0: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10FFFF
72
/* Type check used inside the assert()s of this file.  Release builds only
   test the type; Py_DEBUG builds run the full structural consistency check
   (without scanning the characters, hence the 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* --- Field accessors ----------------------------------------------------

   _PyUnicode_UTF8, _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR,
   _PyUnicode_WSTR_LENGTH, _PyUnicode_LENGTH, _PyUnicode_STATE,
   _PyUnicode_HASH and _PyUnicode_DATA_ANY expand to the struct member
   itself, without any check, so they can also be assigned to.

   The remaining macros assert that the argument is a Unicode object (and,
   for the UTF-8 ones, that it is ready) before reading the value. */

/* Raw utf8 pointer slot of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   slot. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Raw utf8_length slot of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character count for compact ASCII
   strings, otherwise the utf8_length slot. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw wchar_t representation pointer and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw length, state bit-field and cached hash of the common header. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked read of the storage kind. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)

/* Checked read of the length. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Raw data pointer slot of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Cheap stand-in for Py_MAX() when combining two "maximum character"
   values: use it when you are computing the second argument of
   PyUnicode_New().  The result is a bitwise OR, so it is >= both inputs
   but not necessarily equal to the larger one.
   NOTE(review): presumably only meant for the range limits 0x7f, 0xff,
   0xffff and 0x10ffff, for which OR and max coincide -- confirm at the
   call sites. */
#define MAX_MAXCHAR(a, b) \
    ((a) | (b))
118
/* File-local replacement for PyUnicode_READY(): asserts that op is a
   Unicode object, then evaluates to 0 if it is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 slot of op points at the character data itself rather
   than at a separate buffer.  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr slot of op points at the character data itself rather
   than at a separate buffer.  Every use of the argument goes through the
   macro parameter, so the macro works whatever the caller's variable is
   called. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a separate UTF-8 buffer: it is not a
   compact ASCII string, its utf8 slot is set, and that slot does not
   alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) != NULL \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separate wstr buffer: its wstr slot
   is set and either the string is not ready yet or the slot does not
   alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) != NULL \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the range [begin, end) is copied.  to points to the buffer the converted
   characters are written to; it is cast to "to_type *" as a whole
   expression, so something like "buf + offset" behaves as written.

   Each argument is evaluated exactly once.  The copy is unrolled four
   elements at a time, with a plain loop for the remaining 0-3 elements.
   All temporaries carry a leading underscore to stay out of the way of
   names at the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (begin); \
        const from_type *_end = (end); \
        Py_ssize_t _n = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + (_n & ~ (Py_ssize_t) 3); \
        while (_iter < (_unrolled_end)) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
            _iter += 4; _to += 4; \
        } \
        while (_iter < (_end)) \
            *_to++ = (to_type) *_iter++; \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Fast detection of line breaks in the ASCII range: a 128-entry table
   indexed by ASCII code, 1 for a line break character and 0 otherwise.
   The table is a pure lookup table, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build structural check of a Unicode object.

   Every violated invariant trips an assert(); when all of them hold the
   function returns 1, so it can itself be wrapped in assert().

   op:            the object to check (must pass PyUnicode_Check()).
   check_content: if non-zero and the string is not in the wchar kind, the
                  characters are scanned as well to verify that the
                  narrowest possible kind is used and that the buffer is
                  null-terminated. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *data;

        if (base->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the header */
            data = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(cu->utf8 != data);
        }
        else {
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            data = legacy->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(data == NULL);
                assert(cu->utf8 == NULL);
            }
            else {
                /* non-compact but ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(data != NULL);
                if (base->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert(cu->utf8 == data);
                    assert(cu->utf8_length == base->length);
                }
                else
                    assert(cu->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            if (kind == PyUnicode_2BYTE_KIND)
#else
            if (kind == PyUnicode_4BYTE_KIND)
#endif
            {
                assert(base->wstr == data);
                assert(cu->wstr_length == base->length);
            }
            else
                assert(base->wstr != data);
        }

        /* an absent cache must have a zero length */
        assert(cu->utf8 != NULL || cu->utf8_length == 0);
        assert(base->wstr != NULL || cu->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 c = PyUnicode_READ(kind, chars, pos);
            if (c > highest)
                highest = c;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else
                assert(highest < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
415 assert(Py_REFCNT(unicode) == 1);
416
417 len = _PyUnicode_WSTR_LENGTH(unicode);
418 if (len == 0) {
419 Py_INCREF(unicode_empty);
420 Py_DECREF(unicode);
421 return unicode_empty;
422 }
423
424 if (len == 1) {
425 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426 if (ch < 256) {
427 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428 Py_DECREF(unicode);
429 return latin1_char;
430 }
431 }
432
433 if (_PyUnicode_Ready(unicode) < 0) {
434 Py_XDECREF(unicode);
435 return NULL;
436 }
437#else
438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
/* Windows version information, only present in builds with HAVE_MBCS.
   NOTE(review): the variable is filled in outside the part of the file
   visible here -- confirm where. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low
   log2(BLOOM_WIDTH) bits of each Unicode character select the bit. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
Antoine Pitrouf068f942010-01-13 14:19:12 +0000519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters; consulted by BLOOM_LINEBREAK()
   for characters outside the ASCII range. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch):     non-zero if the bit selected by ch is set in mask.
   The bit index is ch modulo BLOOM_WIDTH.  Both arguments are fully
   parenthesized, so arbitrary expressions (e.g. "a | b") can be passed. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if ch is a line break: a table lookup for ch < 128; otherwise
   the bloom mask is used to rule out most characters before calling
   Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555#define BLOOM_MEMBER(mask, chr, str) \
556 (BLOOM(mask, chr) \
557 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643static PyObject*
644resize_compact(PyObject *unicode, Py_ssize_t length)
645{
646 Py_ssize_t char_size;
647 Py_ssize_t struct_size;
648 Py_ssize_t new_size;
649 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100650 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200651 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200688 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 return unicode;
690}
691
Alexander Belopolsky40018472011-02-26 01:02:56 +0000692static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200693resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694{
Victor Stinner95663112011-10-04 01:03:50 +0200695 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100696 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700 if (PyUnicode_IS_READY(unicode)) {
701 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200702 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 void *data;
704
705 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200706 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709
710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
711 PyErr_NoMemory();
712 return -1;
713 }
714 new_size = (length + 1) * char_size;
715
Victor Stinner7a9105a2011-12-12 00:13:42 +0100716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
717 {
718 PyObject_DEL(_PyUnicode_UTF8(unicode));
719 _PyUnicode_UTF8(unicode) = NULL;
720 _PyUnicode_UTF8_LENGTH(unicode) = 0;
721 }
722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 data = (PyObject *)PyObject_REALLOC(data, new_size);
724 if (data == NULL) {
725 PyErr_NoMemory();
726 return -1;
727 }
728 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200731 _PyUnicode_WSTR_LENGTH(unicode) = length;
732 }
733 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200734 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_UTF8_LENGTH(unicode) = length;
736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200740 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 }
Victor Stinner95663112011-10-04 01:03:50 +0200744 assert(_PyUnicode_WSTR(unicode) != NULL);
745
746 /* check for integer overflow */
747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
748 PyErr_NoMemory();
749 return -1;
750 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100751 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200752 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100753 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200754 if (!wstr) {
755 PyErr_NoMemory();
756 return -1;
757 }
758 _PyUnicode_WSTR(unicode) = wstr;
759 _PyUnicode_WSTR(unicode)[length] = 0;
760 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200761 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 return 0;
763}
764
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771
Benjamin Petersonbac79492012-01-14 13:34:47 -0500772 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774
775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776 if (copy == NULL)
777 return NULL;
778
779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200782 }
783 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200784 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200786 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 if (w == NULL)
788 return NULL;
789 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790 copy_length = Py_MIN(copy_length, length);
791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200793 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 }
795}
796
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200806#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200807static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808#endif
809
Alexander Belopolsky40018472011-02-26 01:02:56 +0000810static PyUnicodeObject *
811_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812{
813 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
Thomas Wouters477c8d52006-05-27 19:21:47 +0000816 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 if (length == 0 && unicode_empty != NULL) {
818 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200819 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 }
821
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000822 /* Ensure we won't overflow the size. */
823 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
824 return (PyUnicodeObject *)PyErr_NoMemory();
825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826 if (length < 0) {
827 PyErr_SetString(PyExc_SystemError,
828 "Negative size passed to _PyUnicode_New");
829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830 }
831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832#ifdef Py_DEBUG
833 ++unicode_old_new_calls;
834#endif
835
836 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
837 if (unicode == NULL)
838 return NULL;
839 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
840 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
841 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100842 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000843 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100844 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846
Jeremy Hyltond8082792003-09-16 19:41:39 +0000847 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000848 * the caller fails before initializing str -- unicode_resize()
849 * reads str[0], and the Keep-Alive optimization can keep memory
850 * allocated for str alive across a call to unicode_dealloc(unicode).
851 * We don't want unicode_resize to read uninitialized memory in
852 * that case.
853 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 _PyUnicode_WSTR(unicode)[0] = 0;
855 _PyUnicode_WSTR(unicode)[length] = 0;
856 _PyUnicode_WSTR_LENGTH(unicode) = length;
857 _PyUnicode_HASH(unicode) = -1;
858 _PyUnicode_STATE(unicode).interned = 0;
859 _PyUnicode_STATE(unicode).kind = 0;
860 _PyUnicode_STATE(unicode).compact = 0;
861 _PyUnicode_STATE(unicode).ready = 0;
862 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200863 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200865 _PyUnicode_UTF8(unicode) = NULL;
866 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100867 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868 return unicode;
869}
870
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871static const char*
872unicode_kind_name(PyObject *unicode)
873{
Victor Stinner42dfd712011-10-03 14:41:45 +0200874 /* don't check consistency: unicode_kind_name() is called from
875 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 if (!PyUnicode_IS_COMPACT(unicode))
877 {
878 if (!PyUnicode_IS_READY(unicode))
879 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600880 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200881 {
882 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200883 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200884 return "legacy ascii";
885 else
886 return "legacy latin1";
887 case PyUnicode_2BYTE_KIND:
888 return "legacy UCS2";
889 case PyUnicode_4BYTE_KIND:
890 return "legacy UCS4";
891 default:
892 return "<legacy invalid kind>";
893 }
894 }
895 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600896 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 return "ascii";
900 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200901 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200902 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200903 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200904 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 default:
907 return "<invalid compact kind>";
908 }
909}
910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200912static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913
914/* Functions wrapping macros for use in debugger */
915char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200916 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917}
918
919void *_PyUnicode_compact_data(void *unicode) {
920 return _PyUnicode_COMPACT_DATA(unicode);
921}
922void *_PyUnicode_data(void *unicode){
923 printf("obj %p\n", unicode);
924 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
925 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
926 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
927 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
928 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
929 return PyUnicode_DATA(unicode);
930}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200931
932void
933_PyUnicode_Dump(PyObject *op)
934{
935 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200936 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
937 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
938 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200939
Victor Stinnera849a4b2011-10-03 12:12:11 +0200940 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200941 {
942 if (ascii->state.ascii)
943 data = (ascii + 1);
944 else
945 data = (compact + 1);
946 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200947 else
948 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200949 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
950
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 if (ascii->wstr == data)
952 printf("shared ");
953 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200954
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(" (%zu), ", compact->wstr_length);
957 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
958 printf("shared ");
959 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200960 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200961 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200962}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963#endif
964
965PyObject *
966PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
967{
968 PyObject *obj;
969 PyCompactUnicodeObject *unicode;
970 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200971 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200972 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973 Py_ssize_t char_size;
974 Py_ssize_t struct_size;
975
976 /* Optimization for empty strings */
977 if (size == 0 && unicode_empty != NULL) {
978 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200979 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 }
981
982#ifdef Py_DEBUG
983 ++unicode_new_new_calls;
984#endif
985
Victor Stinner9e9d6892011-10-04 01:02:02 +0200986 is_ascii = 0;
987 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 struct_size = sizeof(PyCompactUnicodeObject);
989 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200990 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200991 char_size = 1;
992 is_ascii = 1;
993 struct_size = sizeof(PyASCIIObject);
994 }
995 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200996 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 char_size = 1;
998 }
999 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 2;
1002 if (sizeof(wchar_t) == 2)
1003 is_sharing = 1;
1004 }
1005 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001006 if (maxchar > MAX_UNICODE) {
1007 PyErr_SetString(PyExc_SystemError,
1008 "invalid maximum character passed to PyUnicode_New");
1009 return NULL;
1010 }
Victor Stinner8f825062012-04-27 13:55:39 +02001011 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001012 char_size = 4;
1013 if (sizeof(wchar_t) == 4)
1014 is_sharing = 1;
1015 }
1016
1017 /* Ensure we won't overflow the size. */
1018 if (size < 0) {
1019 PyErr_SetString(PyExc_SystemError,
1020 "Negative size passed to PyUnicode_New");
1021 return NULL;
1022 }
1023 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1024 return PyErr_NoMemory();
1025
1026 /* Duplicated allocation code from _PyObject_New() instead of a call to
1027 * PyObject_New() so we are able to allocate space for the object and
1028 * it's data buffer.
1029 */
1030 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1031 if (obj == NULL)
1032 return PyErr_NoMemory();
1033 obj = PyObject_INIT(obj, &PyUnicode_Type);
1034 if (obj == NULL)
1035 return NULL;
1036
1037 unicode = (PyCompactUnicodeObject *)obj;
1038 if (is_ascii)
1039 data = ((PyASCIIObject*)obj) + 1;
1040 else
1041 data = unicode + 1;
1042 _PyUnicode_LENGTH(unicode) = size;
1043 _PyUnicode_HASH(unicode) = -1;
1044 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001045 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 _PyUnicode_STATE(unicode).compact = 1;
1047 _PyUnicode_STATE(unicode).ready = 1;
1048 _PyUnicode_STATE(unicode).ascii = is_ascii;
1049 if (is_ascii) {
1050 ((char*)data)[size] = 0;
1051 _PyUnicode_WSTR(unicode) = NULL;
1052 }
Victor Stinner8f825062012-04-27 13:55:39 +02001053 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001058 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 else {
1061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001065 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 ((Py_UCS4*)data)[size] = 0;
1067 if (is_sharing) {
1068 _PyUnicode_WSTR_LENGTH(unicode) = size;
1069 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1070 }
1071 else {
1072 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1073 _PyUnicode_WSTR(unicode) = NULL;
1074 }
1075 }
Victor Stinner8f825062012-04-27 13:55:39 +02001076#ifdef Py_DEBUG
1077 /* Fill the data with invalid characters to detect bugs earlier.
1078 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1079 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1080 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1081 memset(data, 0xff, size * kind);
1082#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001083 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 return obj;
1085}
1086
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that is
   immediately followed by a low surrogate is joined into one code point;
   every other unit is copied as is.  The other conversions are implemented
   as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;
        Py_ssize_t consumed;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            consumed = 2;
        }
        else {
            ch = src[0];
            consumed = 1;
        }
        *dst++ = ch;
        src += consumed;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1126
Victor Stinnercd9950f2011-10-02 00:34:53 +02001127static int
Victor Stinner488fa492011-12-12 00:01:39 +01001128unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001129{
Victor Stinner488fa492011-12-12 00:01:39 +01001130 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001131 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001132 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133 return -1;
1134 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135 return 0;
1136}
1137
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001138static int
1139_copy_characters(PyObject *to, Py_ssize_t to_start,
1140 PyObject *from, Py_ssize_t from_start,
1141 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 unsigned int from_kind, to_kind;
1144 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001145 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinneree4544c2012-05-09 22:24:08 +02001147 assert(0 <= how_many);
1148 assert(0 <= from_start);
1149 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001150 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001152 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001154 if (how_many == 0)
1155 return 0;
1156
Victor Stinnerd3f08822012-05-29 12:57:52 +02001157 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(to));
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001162 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166#ifdef Py_DEBUG
1167 if (!check_maxchar
1168 && (from_kind > to_kind
1169 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001170 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1172 Py_UCS4 ch;
1173 Py_ssize_t i;
1174 for (i=0; i < how_many; i++) {
1175 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1176 assert(ch <= to_maxchar);
1177 }
1178 }
1179#endif
1180 fast = (from_kind == to_kind);
1181 if (check_maxchar
1182 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1183 {
1184 /* deny latin1 => ascii */
1185 fast = 0;
1186 }
1187
1188 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001189 Py_MEMCPY((char*)to_data + to_kind * to_start,
1190 (char*)from_data + from_kind * from_start,
1191 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001193 else if (from_kind == PyUnicode_1BYTE_KIND
1194 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001195 {
1196 _PyUnicode_CONVERT_BYTES(
1197 Py_UCS1, Py_UCS2,
1198 PyUnicode_1BYTE_DATA(from) + from_start,
1199 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1200 PyUnicode_2BYTE_DATA(to) + to_start
1201 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001202 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001203 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 && to_kind == PyUnicode_4BYTE_KIND)
1205 {
1206 _PyUnicode_CONVERT_BYTES(
1207 Py_UCS1, Py_UCS4,
1208 PyUnicode_1BYTE_DATA(from) + from_start,
1209 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1210 PyUnicode_4BYTE_DATA(to) + to_start
1211 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 }
1213 else if (from_kind == PyUnicode_2BYTE_KIND
1214 && to_kind == PyUnicode_4BYTE_KIND)
1215 {
1216 _PyUnicode_CONVERT_BYTES(
1217 Py_UCS2, Py_UCS4,
1218 PyUnicode_2BYTE_DATA(from) + from_start,
1219 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1220 PyUnicode_4BYTE_DATA(to) + to_start
1221 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001222 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001223 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001224 /* check if max_char(from substring) <= max_char(to) */
1225 if (from_kind > to_kind
1226 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001227 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001228 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 /* slow path to check for character overflow */
1230 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001231 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 Py_ssize_t i;
1233
Victor Stinner56c161a2011-10-06 02:47:11 +02001234#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001235 for (i=0; i < how_many; i++) {
1236 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001237 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1239 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001240#else
1241 if (!check_maxchar) {
1242 for (i=0; i < how_many; i++) {
1243 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1244 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1245 }
1246 }
1247 else {
1248 for (i=0; i < how_many; i++) {
1249 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1250 if (ch > to_maxchar)
1251 return 1;
1252 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1253 }
1254 }
1255#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001256 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001257 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001258 assert(0 && "inconsistent state");
1259 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001260 }
1261 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001262 return 0;
1263}
1264
Victor Stinnerd3f08822012-05-29 12:57:52 +02001265void
1266_PyUnicode_FastCopyCharacters(
1267 PyObject *to, Py_ssize_t to_start,
1268 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001269{
1270 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1271}
1272
1273Py_ssize_t
1274PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1275 PyObject *from, Py_ssize_t from_start,
1276 Py_ssize_t how_many)
1277{
1278 int err;
1279
1280 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1281 PyErr_BadInternalCall();
1282 return -1;
1283 }
1284
Benjamin Petersonbac79492012-01-14 13:34:47 -05001285 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001286 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001287 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288 return -1;
1289
Victor Stinnerd3f08822012-05-29 12:57:52 +02001290 if (from_start < 0) {
1291 PyErr_SetString(PyExc_IndexError, "string index out of range");
1292 return -1;
1293 }
1294 if (to_start < 0) {
1295 PyErr_SetString(PyExc_IndexError, "string index out of range");
1296 return -1;
1297 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001298 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1299 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1300 PyErr_Format(PyExc_SystemError,
1301 "Cannot write %zi characters at %zi "
1302 "in a string of %zi characters",
1303 how_many, to_start, PyUnicode_GET_LENGTH(to));
1304 return -1;
1305 }
1306
1307 if (how_many == 0)
1308 return 0;
1309
Victor Stinner488fa492011-12-12 00:01:39 +01001310 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001311 return -1;
1312
1313 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1314 if (err) {
1315 PyErr_Format(PyExc_SystemError,
1316 "Cannot copy %s characters "
1317 "into a string of %s characters",
1318 unicode_kind_name(from),
1319 unicode_kind_name(to));
1320 return -1;
1321 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323}
1324
Victor Stinner17222162011-09-28 22:15:37 +02001325/* Find the maximum code point and count the number of surrogate pairs so a
1326 correct string length can be computed before converting a string to UCS4.
1327 This function counts single surrogates as a character and not as a pair.
1328
1329 Return 0 on success, or -1 on error. */
1330static int
1331find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1332 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333{
1334 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001335 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336
Victor Stinnerc53be962011-10-02 21:33:54 +02001337 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 *num_surrogates = 0;
1339 *maxchar = 0;
1340
1341 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001343 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1344 && (iter+1) < end
1345 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001347 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 iter += 2;
1350 }
1351 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001353 {
1354 ch = *iter;
1355 iter++;
1356 }
1357 if (ch > *maxchar) {
1358 *maxchar = ch;
1359 if (*maxchar > MAX_UNICODE) {
1360 PyErr_Format(PyExc_ValueError,
1361 "character U+%x is not in range [U+0000; U+10ffff]",
1362 ch);
1363 return -1;
1364 }
1365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366 }
1367 return 0;
1368}
1369
1370#ifdef Py_DEBUG
/* Debug builds only: call counter incremented by _PyUnicode_Ready(). */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001371static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372#endif
1373
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001374int
1375_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376{
1377 wchar_t *end;
1378 Py_UCS4 maxchar = 0;
1379 Py_ssize_t num_surrogates;
1380#if SIZEOF_WCHAR_T == 2
1381 Py_ssize_t length_wo_surrogates;
1382#endif
1383
Georg Brandl7597add2011-10-05 16:36:47 +02001384 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001385 strings were created using _PyObject_New() and where no canonical
1386 representation (the str field) has been set yet aka strings
1387 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001388 assert(_PyUnicode_CHECK(unicode));
1389 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001391 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001393 /* Actually, it should neither be interned nor be anything else: */
1394 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395
1396#ifdef Py_DEBUG
1397 ++unicode_ready_calls;
1398#endif
1399
1400 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001401 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001402 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404
1405 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001406 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1407 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 PyErr_NoMemory();
1409 return -1;
1410 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001411 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 _PyUnicode_WSTR(unicode), end,
1413 PyUnicode_1BYTE_DATA(unicode));
1414 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1415 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1416 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1417 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001418 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001420 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 }
1422 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001423 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001424 _PyUnicode_UTF8(unicode) = NULL;
1425 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 }
1427 PyObject_FREE(_PyUnicode_WSTR(unicode));
1428 _PyUnicode_WSTR(unicode) = NULL;
1429 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1430 }
1431 /* In this case we might have to convert down from 4-byte native
1432 wchar_t to 2-byte unicode. */
1433 else if (maxchar < 65536) {
1434 assert(num_surrogates == 0 &&
1435 "FindMaxCharAndNumSurrogatePairs() messed up");
1436
Victor Stinner506f5922011-09-28 22:34:18 +02001437#if SIZEOF_WCHAR_T == 2
1438 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001439 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001440 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1441 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1442 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001443 _PyUnicode_UTF8(unicode) = NULL;
1444 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001445#else
1446 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001447 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001448 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001449 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001450 PyErr_NoMemory();
1451 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 }
Victor Stinner506f5922011-09-28 22:34:18 +02001453 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1454 _PyUnicode_WSTR(unicode), end,
1455 PyUnicode_2BYTE_DATA(unicode));
1456 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1457 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1458 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 _PyUnicode_UTF8(unicode) = NULL;
1460 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyObject_FREE(_PyUnicode_WSTR(unicode));
1462 _PyUnicode_WSTR(unicode) = NULL;
1463 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1464#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 }
1466 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1467 else {
1468#if SIZEOF_WCHAR_T == 2
1469 /* in case the native representation is 2-bytes, we need to allocate a
1470 new normalized 4-byte version. */
1471 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1473 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 PyErr_NoMemory();
1475 return -1;
1476 }
1477 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1478 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001479 _PyUnicode_UTF8(unicode) = NULL;
1480 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001481 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1482 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001483 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 PyObject_FREE(_PyUnicode_WSTR(unicode));
1485 _PyUnicode_WSTR(unicode) = NULL;
1486 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1487#else
1488 assert(num_surrogates == 0);
1489
Victor Stinnerc3c74152011-10-02 20:39:55 +02001490 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 _PyUnicode_UTF8(unicode) = NULL;
1493 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1495#endif
1496 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1497 }
1498 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001499 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 return 0;
1501}
1502
Alexander Belopolsky40018472011-02-26 01:02:56 +00001503static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001504unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505{
Walter Dörwald16807132007-05-25 13:52:07 +00001506 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 case SSTATE_NOT_INTERNED:
1508 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001509
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 case SSTATE_INTERNED_MORTAL:
1511 /* revive dead object temporarily for DelItem */
1512 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001513 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001514 Py_FatalError(
1515 "deletion of interned string failed");
1516 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 case SSTATE_INTERNED_IMMORTAL:
1519 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001520
Benjamin Peterson29060642009-01-31 22:14:21 +00001521 default:
1522 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001523 }
1524
Victor Stinner03490912011-10-03 23:45:12 +02001525 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001527 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001528 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001529 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1530 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001532 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533}
1534
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or the cached
   object of a one-character latin1 string, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a one-character string with a canonical representation can be a
       cached latin1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1551
Alexander Belopolsky40018472011-02-26 01:02:56 +00001552static int
Victor Stinner488fa492011-12-12 00:01:39 +01001553unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001554{
Victor Stinner488fa492011-12-12 00:01:39 +01001555 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001556 if (Py_REFCNT(unicode) != 1)
1557 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001558 if (_PyUnicode_HASH(unicode) != -1)
1559 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560 if (PyUnicode_CHECK_INTERNED(unicode))
1561 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001562 if (!PyUnicode_CheckExact(unicode))
1563 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001564#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001565 /* singleton refcount is greater than 1 */
1566 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001567#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568 return 1;
1569}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001570
Victor Stinnerfe226c02011-10-03 03:52:20 +02001571static int
1572unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1573{
1574 PyObject *unicode;
1575 Py_ssize_t old_length;
1576
1577 assert(p_unicode != NULL);
1578 unicode = *p_unicode;
1579
1580 assert(unicode != NULL);
1581 assert(PyUnicode_Check(unicode));
1582 assert(0 <= length);
1583
Victor Stinner910337b2011-10-03 03:20:16 +02001584 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001585 old_length = PyUnicode_WSTR_LENGTH(unicode);
1586 else
1587 old_length = PyUnicode_GET_LENGTH(unicode);
1588 if (old_length == length)
1589 return 0;
1590
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001591 if (length == 0) {
1592 Py_DECREF(*p_unicode);
1593 *p_unicode = unicode_empty;
1594 Py_INCREF(*p_unicode);
1595 return 0;
1596 }
1597
Victor Stinner488fa492011-12-12 00:01:39 +01001598 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 PyObject *copy = resize_copy(unicode, length);
1600 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 Py_DECREF(*p_unicode);
1603 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001604 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605 }
1606
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001608 PyObject *new_unicode = resize_compact(unicode, length);
1609 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001610 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001611 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001613 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001614 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001615}
1616
Alexander Belopolsky40018472011-02-26 01:02:56 +00001617int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001618PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001619{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 PyObject *unicode;
1621 if (p_unicode == NULL) {
1622 PyErr_BadInternalCall();
1623 return -1;
1624 }
1625 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001626 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001627 {
1628 PyErr_BadInternalCall();
1629 return -1;
1630 }
1631 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001634static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001635unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1636 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001637{
1638 PyObject *result;
1639 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001640 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001641 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1642 return 0;
1643 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1644 maxchar);
1645 if (result == NULL)
1646 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001647 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001648 Py_DECREF(*p_unicode);
1649 *p_unicode = result;
1650 return 0;
1651}
1652
1653static int
1654unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1655 Py_UCS4 ch)
1656{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001657 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001658 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 return -1;
1660 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1661 PyUnicode_DATA(*p_unicode),
1662 (*pos)++, ch);
1663 return 0;
1664}
1665
Victor Stinnerc5166102012-02-22 13:55:02 +01001666/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1667 Return the length of the input string.
1668
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001669 WARNING: The function doesn't copy the terminating null character and
1670 doesn't check the maximum character (may write a latin1 character in an
1671 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001672static Py_ssize_t
1673unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1674{
1675 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1676 void *data = PyUnicode_DATA(unicode);
1677
1678 switch (kind) {
1679 case PyUnicode_1BYTE_KIND: {
1680 Py_ssize_t len = strlen(str);
1681 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001682 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001683 return len;
1684 }
1685 case PyUnicode_2BYTE_KIND: {
1686 Py_UCS2 *start = (Py_UCS2 *)data + index;
1687 Py_UCS2 *ucs2 = start;
1688 assert(index <= PyUnicode_GET_LENGTH(unicode));
1689
1690 for (; *str; ++ucs2, ++str)
1691 *ucs2 = (Py_UCS2)*str;
1692
1693 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1694 return ucs2 - start;
1695 }
1696 default: {
1697 Py_UCS4 *start = (Py_UCS4 *)data + index;
1698 Py_UCS4 *ucs4 = start;
1699 assert(kind == PyUnicode_4BYTE_KIND);
1700 assert(index <= PyUnicode_GET_LENGTH(unicode));
1701
1702 for (; *str; ++ucs4, ++str)
1703 *ucs4 = (Py_UCS4)*str;
1704
1705 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1706 return ucs4 - start;
1707 }
1708 }
1709}
1710
1711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712static PyObject*
1713get_latin1_char(unsigned char ch)
1714{
Victor Stinnera464fc12011-10-02 20:39:30 +02001715 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001717 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 if (!unicode)
1719 return NULL;
1720 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001721 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 unicode_latin1[ch] = unicode;
1723 }
1724 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001725 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726}
1727
Alexander Belopolsky40018472011-02-26 01:02:56 +00001728PyObject *
1729PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001731 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 Py_UCS4 maxchar = 0;
1733 Py_ssize_t num_surrogates;
1734
1735 if (u == NULL)
1736 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001738 /* If the Unicode data is known at construction time, we can apply
1739 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 /* Optimization for empty strings */
1742 if (size == 0 && unicode_empty != NULL) {
1743 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001744 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001745 }
Tim Petersced69f82003-09-16 20:30:58 +00001746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 /* Single character Unicode objects in the Latin-1 range are
1748 shared when using this constructor */
1749 if (size == 1 && *u < 256)
1750 return get_latin1_char((unsigned char)*u);
1751
1752 /* If not empty and not single character, copy the Unicode data
1753 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001754 if (find_maxchar_surrogates(u, u + size,
1755 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 return NULL;
1757
Victor Stinner8faf8212011-12-08 22:14:11 +01001758 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 if (!unicode)
1760 return NULL;
1761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 switch (PyUnicode_KIND(unicode)) {
1763 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001764 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1766 break;
1767 case PyUnicode_2BYTE_KIND:
1768#if Py_UNICODE_SIZE == 2
1769 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1770#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001771 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1773#endif
1774 break;
1775 case PyUnicode_4BYTE_KIND:
1776#if SIZEOF_WCHAR_T == 2
1777 /* This is the only case which has to process surrogates, thus
1778 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001779 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780#else
1781 assert(num_surrogates == 0);
1782 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1783#endif
1784 break;
1785 default:
1786 assert(0 && "Impossible state");
1787 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001789 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790}
1791
Alexander Belopolsky40018472011-02-26 01:02:56 +00001792PyObject *
1793PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001794{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 if (size < 0) {
1796 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001797 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 return NULL;
1799 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001800 if (u != NULL)
1801 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1802 else
1803 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001804}
1805
Alexander Belopolsky40018472011-02-26 01:02:56 +00001806PyObject *
1807PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001808{
1809 size_t size = strlen(u);
1810 if (size > PY_SSIZE_T_MAX) {
1811 PyErr_SetString(PyExc_OverflowError, "input too long");
1812 return NULL;
1813 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001814 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001815}
1816
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001817PyObject *
1818_PyUnicode_FromId(_Py_Identifier *id)
1819{
1820 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001821 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1822 strlen(id->string),
1823 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001824 if (!id->object)
1825 return NULL;
1826 PyUnicode_InternInPlace(&id->object);
1827 assert(!id->next);
1828 id->next = static_strings;
1829 static_strings = id;
1830 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001831 return id->object;
1832}
1833
1834void
1835_PyUnicode_ClearStaticStrings()
1836{
1837 _Py_Identifier *i;
1838 for (i = static_strings; i; i = i->next) {
1839 Py_DECREF(i->object);
1840 i->object = NULL;
1841 i->next = NULL;
1842 }
1843}
1844
Benjamin Peterson0df54292012-03-26 14:50:32 -04001845/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001846
Victor Stinnerd3f08822012-05-29 12:57:52 +02001847PyObject*
1848_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001849{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001850 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001851 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001852 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001853#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001854 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001855#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001856 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001857 }
Victor Stinner785938e2011-12-11 20:09:03 +01001858 unicode = PyUnicode_New(size, 127);
1859 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001860 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001861 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1862 assert(_PyUnicode_CheckConsistency(unicode, 1));
1863 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001864}
1865
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001866static Py_UCS4
1867kind_maxchar_limit(unsigned int kind)
1868{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001869 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001870 case PyUnicode_1BYTE_KIND:
1871 return 0x80;
1872 case PyUnicode_2BYTE_KIND:
1873 return 0x100;
1874 case PyUnicode_4BYTE_KIND:
1875 return 0x10000;
1876 default:
1877 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001878 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001879 }
1880}
1881
Victor Stinnere6abb482012-05-02 01:15:40 +02001882Py_LOCAL_INLINE(Py_UCS4)
1883align_maxchar(Py_UCS4 maxchar)
1884{
1885 if (maxchar <= 127)
1886 return 127;
1887 else if (maxchar <= 255)
1888 return 255;
1889 else if (maxchar <= 65535)
1890 return 65535;
1891 else
1892 return MAX_UNICODE;
1893}
1894
Victor Stinner702c7342011-10-05 13:50:52 +02001895static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001896_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001899 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001900
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001901 if (size == 0) {
1902 Py_INCREF(unicode_empty);
1903 return unicode_empty;
1904 }
1905 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001906 if (size == 1)
1907 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001908
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001909 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 if (!res)
1912 return NULL;
1913 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001914 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001916}
1917
Victor Stinnere57b1c02011-09-28 22:20:48 +02001918static PyObject*
1919_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920{
1921 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001922 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001923
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001924 if (size == 0) {
1925 Py_INCREF(unicode_empty);
1926 return unicode_empty;
1927 }
1928 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001929 if (size == 1) {
1930 Py_UCS4 ch = u[0];
1931 if (ch < 256)
1932 return get_latin1_char((unsigned char)ch);
1933
1934 res = PyUnicode_New(1, ch);
1935 if (res == NULL)
1936 return NULL;
1937 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1938 assert(_PyUnicode_CheckConsistency(res, 1));
1939 return res;
1940 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001941
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001943 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944 if (!res)
1945 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001946 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 else {
1949 _PyUnicode_CONVERT_BYTES(
1950 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1951 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001952 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 return res;
1954}
1955
Victor Stinnere57b1c02011-09-28 22:20:48 +02001956static PyObject*
1957_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958{
1959 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001960 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001961
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001962 if (size == 0) {
1963 Py_INCREF(unicode_empty);
1964 return unicode_empty;
1965 }
1966 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001967 if (size == 1) {
1968 Py_UCS4 ch = u[0];
1969 if (ch < 256)
1970 return get_latin1_char((unsigned char)ch);
1971
1972 res = PyUnicode_New(1, ch);
1973 if (res == NULL)
1974 return NULL;
1975 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1976 assert(_PyUnicode_CheckConsistency(res, 1));
1977 return res;
1978 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001979
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001980 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001981 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 if (!res)
1983 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001984 if (max_char < 256)
1985 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1986 PyUnicode_1BYTE_DATA(res));
1987 else if (max_char < 0x10000)
1988 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1989 PyUnicode_2BYTE_DATA(res));
1990 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001992 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 return res;
1994}
1995
1996PyObject*
1997PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1998{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001999 if (size < 0) {
2000 PyErr_SetString(PyExc_ValueError, "size must be positive");
2001 return NULL;
2002 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002003 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002005 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002007 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002009 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002010 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002011 PyErr_SetString(PyExc_SystemError, "invalid kind");
2012 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014}
2015
Victor Stinnerece58de2012-04-23 23:36:38 +02002016Py_UCS4
2017_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2018{
2019 enum PyUnicode_Kind kind;
2020 void *startptr, *endptr;
2021
2022 assert(PyUnicode_IS_READY(unicode));
2023 assert(0 <= start);
2024 assert(end <= PyUnicode_GET_LENGTH(unicode));
2025 assert(start <= end);
2026
2027 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2028 return PyUnicode_MAX_CHAR_VALUE(unicode);
2029
2030 if (start == end)
2031 return 127;
2032
Victor Stinner94d558b2012-04-27 22:26:58 +02002033 if (PyUnicode_IS_ASCII(unicode))
2034 return 127;
2035
Victor Stinnerece58de2012-04-23 23:36:38 +02002036 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002037 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002038 endptr = (char *)startptr + end * kind;
2039 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002040 switch(kind) {
2041 case PyUnicode_1BYTE_KIND:
2042 return ucs1lib_find_max_char(startptr, endptr);
2043 case PyUnicode_2BYTE_KIND:
2044 return ucs2lib_find_max_char(startptr, endptr);
2045 case PyUnicode_4BYTE_KIND:
2046 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002047 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002048 assert(0);
2049 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002050 }
2051}
2052
Victor Stinner25a4b292011-10-06 12:31:55 +02002053/* Ensure that a string uses the most efficient storage, if it is not the
2054 case: create a new string with of the right kind. Write NULL into *p_unicode
2055 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002056static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002057unicode_adjust_maxchar(PyObject **p_unicode)
2058{
2059 PyObject *unicode, *copy;
2060 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002061 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002062 unsigned int kind;
2063
2064 assert(p_unicode != NULL);
2065 unicode = *p_unicode;
2066 assert(PyUnicode_IS_READY(unicode));
2067 if (PyUnicode_IS_ASCII(unicode))
2068 return;
2069
2070 len = PyUnicode_GET_LENGTH(unicode);
2071 kind = PyUnicode_KIND(unicode);
2072 if (kind == PyUnicode_1BYTE_KIND) {
2073 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002074 max_char = ucs1lib_find_max_char(u, u + len);
2075 if (max_char >= 128)
2076 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002077 }
2078 else if (kind == PyUnicode_2BYTE_KIND) {
2079 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 max_char = ucs2lib_find_max_char(u, u + len);
2081 if (max_char >= 256)
2082 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 }
2084 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002085 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002087 max_char = ucs4lib_find_max_char(u, u + len);
2088 if (max_char >= 0x10000)
2089 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002091 copy = PyUnicode_New(len, max_char);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002092 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002093 Py_DECREF(unicode);
2094 *p_unicode = copy;
2095}
2096
Victor Stinner034f6cf2011-09-30 02:26:44 +02002097PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002098_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002099{
Victor Stinner87af4f22011-11-21 23:03:47 +01002100 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002101 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002102
Victor Stinner034f6cf2011-09-30 02:26:44 +02002103 if (!PyUnicode_Check(unicode)) {
2104 PyErr_BadInternalCall();
2105 return NULL;
2106 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002107 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002108 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002109
Victor Stinner87af4f22011-11-21 23:03:47 +01002110 length = PyUnicode_GET_LENGTH(unicode);
2111 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002112 if (!copy)
2113 return NULL;
2114 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2115
Victor Stinner87af4f22011-11-21 23:03:47 +01002116 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2117 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002118 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002119 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002120}
2121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122
Victor Stinnerbc603d12011-10-02 01:00:40 +02002123/* Widen Unicode objects to larger buffers. Don't write terminating null
2124 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125
2126void*
2127_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2128{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002129 Py_ssize_t len;
2130 void *result;
2131 unsigned int skind;
2132
Benjamin Petersonbac79492012-01-14 13:34:47 -05002133 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002134 return NULL;
2135
2136 len = PyUnicode_GET_LENGTH(s);
2137 skind = PyUnicode_KIND(s);
2138 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002139 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 return NULL;
2141 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002142 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 case PyUnicode_2BYTE_KIND:
2144 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2145 if (!result)
2146 return PyErr_NoMemory();
2147 assert(skind == PyUnicode_1BYTE_KIND);
2148 _PyUnicode_CONVERT_BYTES(
2149 Py_UCS1, Py_UCS2,
2150 PyUnicode_1BYTE_DATA(s),
2151 PyUnicode_1BYTE_DATA(s) + len,
2152 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002154 case PyUnicode_4BYTE_KIND:
2155 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2156 if (!result)
2157 return PyErr_NoMemory();
2158 if (skind == PyUnicode_2BYTE_KIND) {
2159 _PyUnicode_CONVERT_BYTES(
2160 Py_UCS2, Py_UCS4,
2161 PyUnicode_2BYTE_DATA(s),
2162 PyUnicode_2BYTE_DATA(s) + len,
2163 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002165 else {
2166 assert(skind == PyUnicode_1BYTE_KIND);
2167 _PyUnicode_CONVERT_BYTES(
2168 Py_UCS1, Py_UCS4,
2169 PyUnicode_1BYTE_DATA(s),
2170 PyUnicode_1BYTE_DATA(s) + len,
2171 result);
2172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 default:
2175 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 }
Victor Stinner01698042011-10-04 00:04:26 +02002177 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 return NULL;
2179}
2180
2181static Py_UCS4*
2182as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2183 int copy_null)
2184{
2185 int kind;
2186 void *data;
2187 Py_ssize_t len, targetlen;
2188 if (PyUnicode_READY(string) == -1)
2189 return NULL;
2190 kind = PyUnicode_KIND(string);
2191 data = PyUnicode_DATA(string);
2192 len = PyUnicode_GET_LENGTH(string);
2193 targetlen = len;
2194 if (copy_null)
2195 targetlen++;
2196 if (!target) {
2197 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2198 PyErr_NoMemory();
2199 return NULL;
2200 }
2201 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2202 if (!target) {
2203 PyErr_NoMemory();
2204 return NULL;
2205 }
2206 }
2207 else {
2208 if (targetsize < targetlen) {
2209 PyErr_Format(PyExc_SystemError,
2210 "string is longer than the buffer");
2211 if (copy_null && 0 < targetsize)
2212 target[0] = 0;
2213 return NULL;
2214 }
2215 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002216 if (kind == PyUnicode_1BYTE_KIND) {
2217 Py_UCS1 *start = (Py_UCS1 *) data;
2218 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002220 else if (kind == PyUnicode_2BYTE_KIND) {
2221 Py_UCS2 *start = (Py_UCS2 *) data;
2222 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2223 }
2224 else {
2225 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 if (copy_null)
2229 target[len] = 0;
2230 return target;
2231}
2232
2233Py_UCS4*
2234PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2235 int copy_null)
2236{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002237 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 PyErr_BadInternalCall();
2239 return NULL;
2240 }
2241 return as_ucs4(string, target, targetsize, copy_null);
2242}
2243
2244Py_UCS4*
2245PyUnicode_AsUCS4Copy(PyObject *string)
2246{
2247 return as_ucs4(string, NULL, 0, 1);
2248}
2249
2250#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002251
Alexander Belopolsky40018472011-02-26 01:02:56 +00002252PyObject *
2253PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002254{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002256 if (size == 0) {
2257 Py_INCREF(unicode_empty);
2258 return unicode_empty;
2259 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002260 PyErr_BadInternalCall();
2261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 }
2263
Martin v. Löwis790465f2008-04-05 20:41:37 +00002264 if (size == -1) {
2265 size = wcslen(w);
2266 }
2267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269}
2270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002272
Walter Dörwald346737f2007-05-31 10:44:43 +00002273static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002274makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2275 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002276{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 *fmt++ = '%';
2278 if (width) {
2279 if (zeropad)
2280 *fmt++ = '0';
2281 fmt += sprintf(fmt, "%d", width);
2282 }
2283 if (precision)
2284 fmt += sprintf(fmt, ".%d", precision);
2285 if (longflag)
2286 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002287 else if (longlongflag) {
2288 /* longlongflag should only ever be nonzero on machines with
2289 HAVE_LONG_LONG defined */
2290#ifdef HAVE_LONG_LONG
2291 char *f = PY_FORMAT_LONG_LONG;
2292 while (*f)
2293 *fmt++ = *f++;
2294#else
2295 /* we shouldn't ever get here */
2296 assert(0);
2297 *fmt++ = 'l';
2298#endif
2299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 else if (size_tflag) {
2301 char *f = PY_FORMAT_SIZE_T;
2302 while (*f)
2303 *fmt++ = *f++;
2304 }
2305 *fmt++ = c;
2306 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002307}
2308
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows '%'.

   `f` must point to the '%'.  The optional width and precision
   (e.g. "%2.5s" => width=2, precision=5) and the length modifiers "l",
   "ll" (only with HAVE_LONG_LONG) and "z" are parsed; a modifier is only
   recognized when it is directly followed by 'd', 'u' or 'i'.  Each result
   is stored through the matching pointer unless that pointer is NULL.

   Return a pointer to the conversion character (or, for a bogus format,
   to the last character that was consumed). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* skip the '%' and read the width */
    ++f;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';

    /* read the optional ".precision" */
    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    switch (*f) {
    case 'l':
        /* %ld, %lu, %li and (optionally) %lld, %llu, %lli */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* %zd, %zu, %zi: the size_t flag */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2373
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002374/* maximum number of characters required for output of %ld. 21 characters
2375 allows for 64-bit integers (in decimal) and an optional sign. */
2376#define MAX_LONG_CHARS 21
2377/* maximum number of characters required for output of %lld.
2378 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2379 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2380#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2381
Walter Dörwaldd2034312007-05-18 16:29:38 +00002382PyObject *
2383PyUnicode_FromFormatV(const char *format, va_list vargs)
2384{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 va_list count;
2386 Py_ssize_t callcount = 0;
2387 PyObject **callresults = NULL;
2388 PyObject **callresult = NULL;
2389 Py_ssize_t n = 0;
2390 int width = 0;
2391 int precision = 0;
2392 int zeropad;
2393 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002394 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002395 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002396 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2398 Py_UCS4 argmaxchar;
2399 Py_ssize_t numbersize = 0;
2400 char *numberresults = NULL;
2401 char *numberresult = NULL;
2402 Py_ssize_t i;
2403 int kind;
2404 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002405
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002406 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002407 /* step 1: count the number of %S/%R/%A/%s format specifications
2408 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2409 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002411 * also estimate a upper bound for all the number formats in the string,
2412 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 for (f = format; *f; f++) {
2415 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002416 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2418 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2419 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2420 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002423#ifdef HAVE_LONG_LONG
2424 if (longlongflag) {
2425 if (width < MAX_LONG_LONG_CHARS)
2426 width = MAX_LONG_LONG_CHARS;
2427 }
2428 else
2429#endif
2430 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2431 including sign. Decimal takes the most space. This
2432 isn't enough for octal. If a width is specified we
2433 need more (which we allocate later). */
2434 if (width < MAX_LONG_CHARS)
2435 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436
2437 /* account for the size + '\0' to separate numbers
2438 inside of the numberresults buffer */
2439 numbersize += (width + 1);
2440 }
2441 }
2442 else if ((unsigned char)*f > 127) {
2443 PyErr_Format(PyExc_ValueError,
2444 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2445 "string, got a non-ASCII byte: 0x%02x",
2446 (unsigned char)*f);
2447 return NULL;
2448 }
2449 }
2450 /* step 2: allocate memory for the results of
2451 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2452 if (callcount) {
2453 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2454 if (!callresults) {
2455 PyErr_NoMemory();
2456 return NULL;
2457 }
2458 callresult = callresults;
2459 }
2460 /* step 2.5: allocate memory for the results of formating numbers */
2461 if (numbersize) {
2462 numberresults = PyObject_Malloc(numbersize);
2463 if (!numberresults) {
2464 PyErr_NoMemory();
2465 goto fail;
2466 }
2467 numberresult = numberresults;
2468 }
2469
2470 /* step 3: format numbers and figure out how large a buffer we need */
2471 for (f = format; *f; f++) {
2472 if (*f == '%') {
2473 const char* p;
2474 int longflag;
2475 int longlongflag;
2476 int size_tflag;
2477 int numprinted;
2478
2479 p = f;
2480 zeropad = (f[1] == '0');
2481 f = parse_format_flags(f, &width, &precision,
2482 &longflag, &longlongflag, &size_tflag);
2483 switch (*f) {
2484 case 'c':
2485 {
2486 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002487 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 n++;
2489 break;
2490 }
2491 case '%':
2492 n++;
2493 break;
2494 case 'i':
2495 case 'd':
2496 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2497 width, precision, *f);
2498 if (longflag)
2499 numprinted = sprintf(numberresult, fmt,
2500 va_arg(count, long));
2501#ifdef HAVE_LONG_LONG
2502 else if (longlongflag)
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, PY_LONG_LONG));
2505#endif
2506 else if (size_tflag)
2507 numprinted = sprintf(numberresult, fmt,
2508 va_arg(count, Py_ssize_t));
2509 else
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, int));
2512 n += numprinted;
2513 /* advance by +1 to skip over the '\0' */
2514 numberresult += (numprinted + 1);
2515 assert(*(numberresult - 1) == '\0');
2516 assert(*(numberresult - 2) != '\0');
2517 assert(numprinted >= 0);
2518 assert(numberresult <= numberresults + numbersize);
2519 break;
2520 case 'u':
2521 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2522 width, precision, 'u');
2523 if (longflag)
2524 numprinted = sprintf(numberresult, fmt,
2525 va_arg(count, unsigned long));
2526#ifdef HAVE_LONG_LONG
2527 else if (longlongflag)
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, unsigned PY_LONG_LONG));
2530#endif
2531 else if (size_tflag)
2532 numprinted = sprintf(numberresult, fmt,
2533 va_arg(count, size_t));
2534 else
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned int));
2537 n += numprinted;
2538 numberresult += (numprinted + 1);
2539 assert(*(numberresult - 1) == '\0');
2540 assert(*(numberresult - 2) != '\0');
2541 assert(numprinted >= 0);
2542 assert(numberresult <= numberresults + numbersize);
2543 break;
2544 case 'x':
2545 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2546 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2547 n += numprinted;
2548 numberresult += (numprinted + 1);
2549 assert(*(numberresult - 1) == '\0');
2550 assert(*(numberresult - 2) != '\0');
2551 assert(numprinted >= 0);
2552 assert(numberresult <= numberresults + numbersize);
2553 break;
2554 case 'p':
2555 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2556 /* %p is ill-defined: ensure leading 0x. */
2557 if (numberresult[1] == 'X')
2558 numberresult[1] = 'x';
2559 else if (numberresult[1] != 'x') {
2560 memmove(numberresult + 2, numberresult,
2561 strlen(numberresult) + 1);
2562 numberresult[0] = '0';
2563 numberresult[1] = 'x';
2564 numprinted += 2;
2565 }
2566 n += numprinted;
2567 numberresult += (numprinted + 1);
2568 assert(*(numberresult - 1) == '\0');
2569 assert(*(numberresult - 2) != '\0');
2570 assert(numprinted >= 0);
2571 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
2573 case 's':
2574 {
2575 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002576 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002577 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002578 if (!str)
2579 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 /* since PyUnicode_DecodeUTF8 returns already flexible
2581 unicode objects, there is no need to call ready on them */
2582 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002583 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002585 /* Remember the str and switch to the next slot */
2586 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 break;
2588 }
2589 case 'U':
2590 {
2591 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002592 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 if (PyUnicode_READY(obj) == -1)
2594 goto fail;
2595 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002596 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 break;
2599 }
2600 case 'V':
2601 {
2602 PyObject *obj = va_arg(count, PyObject *);
2603 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002604 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002606 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002607 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 if (PyUnicode_READY(obj) == -1)
2609 goto fail;
2610 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002611 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002613 *callresult++ = NULL;
2614 }
2615 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002616 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002617 if (!str_obj)
2618 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002619 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002620 Py_DECREF(str_obj);
2621 goto fail;
2622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002624 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002626 *callresult++ = str_obj;
2627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 break;
2629 }
2630 case 'S':
2631 {
2632 PyObject *obj = va_arg(count, PyObject *);
2633 PyObject *str;
2634 assert(obj);
2635 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002636 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002638 if (PyUnicode_READY(str) == -1) {
2639 Py_DECREF(str);
2640 goto fail;
2641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002643 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 /* Remember the str and switch to the next slot */
2646 *callresult++ = str;
2647 break;
2648 }
2649 case 'R':
2650 {
2651 PyObject *obj = va_arg(count, PyObject *);
2652 PyObject *repr;
2653 assert(obj);
2654 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002655 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002657 if (PyUnicode_READY(repr) == -1) {
2658 Py_DECREF(repr);
2659 goto fail;
2660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002662 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 /* Remember the repr and switch to the next slot */
2665 *callresult++ = repr;
2666 break;
2667 }
2668 case 'A':
2669 {
2670 PyObject *obj = va_arg(count, PyObject *);
2671 PyObject *ascii;
2672 assert(obj);
2673 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002674 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002676 if (PyUnicode_READY(ascii) == -1) {
2677 Py_DECREF(ascii);
2678 goto fail;
2679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002681 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 /* Remember the repr and switch to the next slot */
2684 *callresult++ = ascii;
2685 break;
2686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 default:
2688 /* if we stumble upon an unknown
2689 formatting code, copy the rest of
2690 the format string to the output
2691 string. (we cannot just skip the
2692 code, since there's no way to know
2693 what's in the argument list) */
2694 n += strlen(p);
2695 goto expand;
2696 }
2697 } else
2698 n++;
2699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 we don't have to resize the string.
2704 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002705 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 if (!string)
2707 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 kind = PyUnicode_KIND(string);
2709 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002715 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002716
2717 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2719 /* checking for == because the last argument could be a empty
2720 string, which causes i to point to end, the assert at the end of
2721 the loop */
2722 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 switch (*f) {
2725 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002726 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 const int ordinal = va_arg(vargs, int);
2728 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002730 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002731 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002732 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002736 {
2737 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 /* unused, since we already have the result */
2739 if (*f == 'p')
2740 (void) va_arg(vargs, void *);
2741 else
2742 (void) va_arg(vargs, int);
2743 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002744 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002746 i += written;
2747 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 assert(*numberresult == '\0');
2749 numberresult++;
2750 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002752 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002753 case 's':
2754 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002755 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002757 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 size = PyUnicode_GET_LENGTH(*callresult);
2759 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002760 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002762 /* We're done with the unicode()/repr() => forget it */
2763 Py_DECREF(*callresult);
2764 /* switch to next unicode()/repr() result */
2765 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 break;
2767 }
2768 case 'U':
2769 {
2770 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002771 Py_ssize_t size;
2772 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2773 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002774 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002776 break;
2777 }
2778 case 'V':
2779 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002781 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002782 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 size = PyUnicode_GET_LENGTH(obj);
2785 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002786 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 size = PyUnicode_GET_LENGTH(*callresult);
2790 assert(PyUnicode_KIND(*callresult) <=
2791 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002792 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002794 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002795 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002796 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 break;
2798 }
2799 case 'S':
2800 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002801 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002802 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002803 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002804 /* unused, since we already have the result */
2805 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002807 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002808 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 /* We're done with the unicode()/repr() => forget it */
2810 Py_DECREF(*callresult);
2811 /* switch to next unicode()/repr() result */
2812 ++callresult;
2813 break;
2814 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 break;
2818 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002819 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002821 goto end;
2822 }
Victor Stinner1205f272010-09-11 00:54:47 +00002823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 else {
2825 assert(i < PyUnicode_GET_LENGTH(string));
2826 PyUnicode_WRITE(kind, data, i++, *f);
2827 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002830
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002832 if (callresults)
2833 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 if (numberresults)
2835 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002836 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002838 if (callresults) {
2839 PyObject **callresult2 = callresults;
2840 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002841 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002842 ++callresult2;
2843 }
2844 PyObject_Free(callresults);
2845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002846 if (numberresults)
2847 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002848 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002849}
2850
Walter Dörwaldd2034312007-05-18 16:29:38 +00002851PyObject *
2852PyUnicode_FromFormat(const char *format, ...)
2853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 PyObject* ret;
2855 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002856
2857#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002859#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002860 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002862 ret = PyUnicode_FromFormatV(format, vargs);
2863 va_end(vargs);
2864 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865}
2866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002867#ifdef HAVE_WCHAR_H
2868
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2870 convert a Unicode object to a wide character string.
2871
Victor Stinnerd88d9832011-09-06 02:00:05 +02002872 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002873 character) required to convert the unicode object. Ignore size argument.
2874
Victor Stinnerd88d9832011-09-06 02:00:05 +02002875 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002876 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002877 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002878static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002879unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002880 wchar_t *w,
2881 Py_ssize_t size)
2882{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002883 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002884 const wchar_t *wstr;
2885
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002886 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002887 if (wstr == NULL)
2888 return -1;
2889
Victor Stinner5593d8a2010-10-02 11:11:27 +00002890 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002891 if (size > res)
2892 size = res + 1;
2893 else
2894 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002895 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002896 return res;
2897 }
2898 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002900}
2901
2902Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002903PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002904 wchar_t *w,
2905 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906{
2907 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002908 PyErr_BadInternalCall();
2909 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002911 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912}
2913
Victor Stinner137c34c2010-09-29 10:25:54 +00002914wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002915PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002916 Py_ssize_t *size)
2917{
2918 wchar_t* buffer;
2919 Py_ssize_t buflen;
2920
2921 if (unicode == NULL) {
2922 PyErr_BadInternalCall();
2923 return NULL;
2924 }
2925
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002926 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002927 if (buflen == -1)
2928 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002929 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002930 PyErr_NoMemory();
2931 return NULL;
2932 }
2933
Victor Stinner137c34c2010-09-29 10:25:54 +00002934 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2935 if (buffer == NULL) {
2936 PyErr_NoMemory();
2937 return NULL;
2938 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002939 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 if (buflen == -1)
2941 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002942 if (size != NULL)
2943 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002944 return buffer;
2945}
2946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948
Alexander Belopolsky40018472011-02-26 01:02:56 +00002949PyObject *
2950PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002953 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 PyErr_SetString(PyExc_ValueError,
2955 "chr() arg not in range(0x110000)");
2956 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002957 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 if (ordinal < 256)
2960 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002962 v = PyUnicode_New(1, ordinal);
2963 if (v == NULL)
2964 return NULL;
2965 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002966 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002967 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002968}
2969
Alexander Belopolsky40018472011-02-26 01:02:56 +00002970PyObject *
2971PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002973 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002975 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002976 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002977 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 Py_INCREF(obj);
2979 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002980 }
2981 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002982 /* For a Unicode subtype that's not a Unicode object,
2983 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002984 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002985 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002986 PyErr_Format(PyExc_TypeError,
2987 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002988 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002989 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 const char *encoding,
2995 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002996{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002997 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002998 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002999
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 PyErr_BadInternalCall();
3002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003004
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003005 /* Decoding bytes objects is the most common case and should be fast */
3006 if (PyBytes_Check(obj)) {
3007 if (PyBytes_GET_SIZE(obj) == 0) {
3008 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003009 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003010 }
3011 else {
3012 v = PyUnicode_Decode(
3013 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3014 encoding, errors);
3015 }
3016 return v;
3017 }
3018
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003019 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyErr_SetString(PyExc_TypeError,
3021 "decoding str is not supported");
3022 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003023 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003024
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003025 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3026 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3027 PyErr_Format(PyExc_TypeError,
3028 "coercing to str: need bytes, bytearray "
3029 "or buffer-like object, %.80s found",
3030 Py_TYPE(obj)->tp_name);
3031 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003032 }
Tim Petersced69f82003-09-16 20:30:58 +00003033
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003034 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003036 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 }
Tim Petersced69f82003-09-16 20:30:58 +00003038 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003039 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003040
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003041 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003042 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043}
3044
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  name to normalize; NULL stands for "utf-8".
   lower:     output buffer receiving the null-terminated result.
   lower_len: capacity of lower in bytes, terminator included.

   Return 1 on success, 0 on error (the normalized name, including its
   terminator, does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* An empty buffer cannot even hold the terminator; bail out before
       lower_len - 1 wraps around below. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the null terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 Py_ssize_t size,
3085 const char *encoding,
3086 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003087{
3088 PyObject *buffer = NULL, *unicode;
3089 Py_buffer info;
3090 char lower[11]; /* Enough for any encoding shortcut */
3091
Fred Drakee4315f52000-05-09 19:53:39 +00003092 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003093 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003094 if ((strcmp(lower, "utf-8") == 0) ||
3095 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003097 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003098 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003099 (strcmp(lower, "iso-8859-1") == 0))
3100 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003101#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003102 else if (strcmp(lower, "mbcs") == 0)
3103 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003104#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003105 else if (strcmp(lower, "ascii") == 0)
3106 return PyUnicode_DecodeASCII(s, size, errors);
3107 else if (strcmp(lower, "utf-16") == 0)
3108 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3109 else if (strcmp(lower, "utf-32") == 0)
3110 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112
3113 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003114 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003115 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003116 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003117 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 if (buffer == NULL)
3119 goto onError;
3120 unicode = PyCodec_Decode(buffer, encoding, errors);
3121 if (unicode == NULL)
3122 goto onError;
3123 if (!PyUnicode_Check(unicode)) {
3124 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003125 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003126 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 Py_DECREF(unicode);
3128 goto onError;
3129 }
3130 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003131 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003132
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 Py_XDECREF(buffer);
3135 return NULL;
3136}
3137
Alexander Belopolsky40018472011-02-26 01:02:56 +00003138PyObject *
3139PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003140 const char *encoding,
3141 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003142{
3143 PyObject *v;
3144
3145 if (!PyUnicode_Check(unicode)) {
3146 PyErr_BadArgument();
3147 goto onError;
3148 }
3149
3150 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003151 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003152
3153 /* Decode via the codec registry */
3154 v = PyCodec_Decode(unicode, encoding, errors);
3155 if (v == NULL)
3156 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003157 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003158
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003160 return NULL;
3161}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 const char *encoding,
3166 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003167{
3168 PyObject *v;
3169
3170 if (!PyUnicode_Check(unicode)) {
3171 PyErr_BadArgument();
3172 goto onError;
3173 }
3174
3175 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003177
3178 /* Decode via the codec registry */
3179 v = PyCodec_Decode(unicode, encoding, errors);
3180 if (v == NULL)
3181 goto onError;
3182 if (!PyUnicode_Check(v)) {
3183 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003184 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185 Py_TYPE(v)->tp_name);
3186 Py_DECREF(v);
3187 goto onError;
3188 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003189 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192 return NULL;
3193}
3194
Alexander Belopolsky40018472011-02-26 01:02:56 +00003195PyObject *
3196PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003197 Py_ssize_t size,
3198 const char *encoding,
3199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200{
3201 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003202
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 unicode = PyUnicode_FromUnicode(s, size);
3204 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3207 Py_DECREF(unicode);
3208 return v;
3209}
3210
Alexander Belopolsky40018472011-02-26 01:02:56 +00003211PyObject *
3212PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003213 const char *encoding,
3214 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003215{
3216 PyObject *v;
3217
3218 if (!PyUnicode_Check(unicode)) {
3219 PyErr_BadArgument();
3220 goto onError;
3221 }
3222
3223 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003225
3226 /* Encode via the codec registry */
3227 v = PyCodec_Encode(unicode, encoding, errors);
3228 if (v == NULL)
3229 goto onError;
3230 return v;
3231
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003233 return NULL;
3234}
3235
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one
   surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index of the offending character in wstr, or 0 when every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *origin = wstr;
    const wchar_t *current;
    char scratch[MB_LEN_MAX];
    size_t converted;
    /* One character (or one surrogate pair) plus the null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t piece[3] = {0, 0, 0};
#else
    wchar_t piece[2] = {0, 0};
#endif

    while (*wstr != L'\0') {
        current = wstr;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            /* Keep both halves of the pair together. */
            piece[0] = wstr[0];
            piece[1] = wstr[1];
            wstr += 2;
        }
        else {
            piece[0] = wstr[0];
            piece[1] = 0;
            wstr += 1;
        }
#else
        piece[0] = wstr[0];
        wstr += 1;
#endif
        converted = wcstombs(scratch, piece, sizeof(scratch));
        if (converted == (size_t)-1)
            return current - origin;
    }

    /* failed to find the unencodable character */
    return 0;
}
3282
Victor Stinner1b579672011-12-17 05:47:23 +01003283static int
3284locale_error_handler(const char *errors, int *surrogateescape)
3285{
3286 if (errors == NULL) {
3287 *surrogateescape = 0;
3288 return 0;
3289 }
3290
3291 if (strcmp(errors, "strict") == 0) {
3292 *surrogateescape = 0;
3293 return 0;
3294 }
3295 if (strcmp(errors, "surrogateescape") == 0) {
3296 *surrogateescape = 1;
3297 return 0;
3298 }
3299 PyErr_Format(PyExc_ValueError,
3300 "only 'strict' and 'surrogateescape' error handlers "
3301 "are supported, not '%s'",
3302 errors);
3303 return -1;
3304}
3305
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003306PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003307PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003308{
3309 Py_ssize_t wlen, wlen2;
3310 wchar_t *wstr;
3311 PyObject *bytes = NULL;
3312 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003313 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003314 PyObject *exc;
3315 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003316 int surrogateescape;
3317
3318 if (locale_error_handler(errors, &surrogateescape) < 0)
3319 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320
3321 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3322 if (wstr == NULL)
3323 return NULL;
3324
3325 wlen2 = wcslen(wstr);
3326 if (wlen2 != wlen) {
3327 PyMem_Free(wstr);
3328 PyErr_SetString(PyExc_TypeError, "embedded null character");
3329 return NULL;
3330 }
3331
3332 if (surrogateescape) {
3333 /* locale encoding with surrogateescape */
3334 char *str;
3335
3336 str = _Py_wchar2char(wstr, &error_pos);
3337 if (str == NULL) {
3338 if (error_pos == (size_t)-1) {
3339 PyErr_NoMemory();
3340 PyMem_Free(wstr);
3341 return NULL;
3342 }
3343 else {
3344 goto encode_error;
3345 }
3346 }
3347 PyMem_Free(wstr);
3348
3349 bytes = PyBytes_FromString(str);
3350 PyMem_Free(str);
3351 }
3352 else {
3353 size_t len, len2;
3354
3355 len = wcstombs(NULL, wstr, 0);
3356 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003357 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003358 goto encode_error;
3359 }
3360
3361 bytes = PyBytes_FromStringAndSize(NULL, len);
3362 if (bytes == NULL) {
3363 PyMem_Free(wstr);
3364 return NULL;
3365 }
3366
3367 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3368 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003369 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370 goto encode_error;
3371 }
3372 PyMem_Free(wstr);
3373 }
3374 return bytes;
3375
3376encode_error:
3377 errmsg = strerror(errno);
3378 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003379
3380 if (error_pos == (size_t)-1)
3381 error_pos = wcstombs_errorpos(wstr);
3382
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383 PyMem_Free(wstr);
3384 Py_XDECREF(bytes);
3385
Victor Stinner2f197072011-12-17 07:08:30 +01003386 if (errmsg != NULL) {
3387 size_t errlen;
3388 wstr = _Py_char2wchar(errmsg, &errlen);
3389 if (wstr != NULL) {
3390 reason = PyUnicode_FromWideChar(wstr, errlen);
3391 PyMem_Free(wstr);
3392 } else
3393 errmsg = NULL;
3394 }
3395 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003396 reason = PyUnicode_FromString(
3397 "wcstombs() encountered an unencodable "
3398 "wide character");
3399 if (reason == NULL)
3400 return NULL;
3401
3402 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3403 "locale", unicode,
3404 (Py_ssize_t)error_pos,
3405 (Py_ssize_t)(error_pos+1),
3406 reason);
3407 Py_DECREF(reason);
3408 if (exc != NULL) {
3409 PyCodec_StrictErrors(exc);
3410 Py_XDECREF(exc);
3411 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412 return NULL;
3413}
3414
Victor Stinnerad158722010-10-27 00:25:46 +00003415PyObject *
3416PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003417{
Victor Stinner99b95382011-07-04 14:23:54 +02003418#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003419 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003420#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003422#else
Victor Stinner793b5312011-04-27 00:24:21 +02003423 PyInterpreterState *interp = PyThreadState_GET()->interp;
3424 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3425 cannot use it to encode and decode filenames before it is loaded. Load
3426 the Python codec requires to encode at least its own filename. Use the C
3427 version of the locale codec until the codec registry is initialized and
3428 the Python codec is loaded.
3429
3430 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3431 cannot only rely on it: check also interp->fscodec_initialized for
3432 subinterpreters. */
3433 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003434 return PyUnicode_AsEncodedString(unicode,
3435 Py_FileSystemDefaultEncoding,
3436 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003437 }
3438 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003439 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003440 }
Victor Stinnerad158722010-10-27 00:25:46 +00003441#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003442}
3443
Alexander Belopolsky40018472011-02-26 01:02:56 +00003444PyObject *
3445PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003446 const char *encoding,
3447 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448{
3449 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003450 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003451
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 if (!PyUnicode_Check(unicode)) {
3453 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Fred Drakee4315f52000-05-09 19:53:39 +00003456
Fred Drakee4315f52000-05-09 19:53:39 +00003457 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003458 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003459 if ((strcmp(lower, "utf-8") == 0) ||
3460 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003461 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003462 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003464 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003466 }
Victor Stinner37296e82010-06-10 13:36:23 +00003467 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003468 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003469 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003471#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003472 else if (strcmp(lower, "mbcs") == 0)
3473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003474#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003475 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478
3479 /* Encode via the codec registry */
3480 v = PyCodec_Encode(unicode, encoding, errors);
3481 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003482 return NULL;
3483
3484 /* The normal path */
3485 if (PyBytes_Check(v))
3486 return v;
3487
3488 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003490 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003491 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003492
3493 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3494 "encoder %s returned bytearray instead of bytes",
3495 encoding);
3496 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003497 Py_DECREF(v);
3498 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003499 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003501 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3502 Py_DECREF(v);
3503 return b;
3504 }
3505
3506 PyErr_Format(PyExc_TypeError,
3507 "encoder did not return a bytes object (type=%.400s)",
3508 Py_TYPE(v)->tp_name);
3509 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003510 return NULL;
3511}
3512
Alexander Belopolsky40018472011-02-26 01:02:56 +00003513PyObject *
3514PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003515 const char *encoding,
3516 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003517{
3518 PyObject *v;
3519
3520 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument();
3522 goto onError;
3523 }
3524
3525 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003526 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003527
3528 /* Encode via the codec registry */
3529 v = PyCodec_Encode(unicode, encoding, errors);
3530 if (v == NULL)
3531 goto onError;
3532 if (!PyUnicode_Check(v)) {
3533 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003534 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 Py_TYPE(v)->tp_name);
3536 Py_DECREF(v);
3537 goto onError;
3538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003540
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 return NULL;
3543}
3544
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   decode, converting one multibyte character at a time.

   Return the byte offset of the invalid or incomplete sequence, or 0 when
   none is found.  Without HAVE_MBRTOWC no search is possible and 0 is
   always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    wchar_t decoded;
    size_t consumed;

    memset(&state, 0, sizeof state);
    for (; len != 0; cursor += consumed, len -= consumed) {
        consumed = mbrtowc(&decoded, cursor, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3575
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003576PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003577PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003578 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003579{
3580 wchar_t smallbuf[256];
3581 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3582 wchar_t *wstr;
3583 size_t wlen, wlen2;
3584 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003585 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003586 size_t error_pos;
3587 char *errmsg;
3588 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003589
3590 if (locale_error_handler(errors, &surrogateescape) < 0)
3591 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003592
3593 if (str[len] != '\0' || len != strlen(str)) {
3594 PyErr_SetString(PyExc_TypeError, "embedded null character");
3595 return NULL;
3596 }
3597
3598 if (surrogateescape)
3599 {
3600 wstr = _Py_char2wchar(str, &wlen);
3601 if (wstr == NULL) {
3602 if (wlen == (size_t)-1)
3603 PyErr_NoMemory();
3604 else
3605 PyErr_SetFromErrno(PyExc_OSError);
3606 return NULL;
3607 }
3608
3609 unicode = PyUnicode_FromWideChar(wstr, wlen);
3610 PyMem_Free(wstr);
3611 }
3612 else {
3613#ifndef HAVE_BROKEN_MBSTOWCS
3614 wlen = mbstowcs(NULL, str, 0);
3615#else
3616 wlen = len;
3617#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003618 if (wlen == (size_t)-1)
3619 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003620 if (wlen+1 <= smallbuf_len) {
3621 wstr = smallbuf;
3622 }
3623 else {
3624 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3625 return PyErr_NoMemory();
3626
3627 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3628 if (!wstr)
3629 return PyErr_NoMemory();
3630 }
3631
3632 /* This shouldn't fail now */
3633 wlen2 = mbstowcs(wstr, str, wlen+1);
3634 if (wlen2 == (size_t)-1) {
3635 if (wstr != smallbuf)
3636 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003637 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003638 }
3639#ifdef HAVE_BROKEN_MBSTOWCS
3640 assert(wlen2 == wlen);
3641#endif
3642 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3643 if (wstr != smallbuf)
3644 PyMem_Free(wstr);
3645 }
3646 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003647
3648decode_error:
3649 errmsg = strerror(errno);
3650 assert(errmsg != NULL);
3651
3652 error_pos = mbstowcs_errorpos(str, len);
3653 if (errmsg != NULL) {
3654 size_t errlen;
3655 wstr = _Py_char2wchar(errmsg, &errlen);
3656 if (wstr != NULL) {
3657 reason = PyUnicode_FromWideChar(wstr, errlen);
3658 PyMem_Free(wstr);
3659 } else
3660 errmsg = NULL;
3661 }
3662 if (errmsg == NULL)
3663 reason = PyUnicode_FromString(
3664 "mbstowcs() encountered an invalid multibyte sequence");
3665 if (reason == NULL)
3666 return NULL;
3667
3668 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3669 "locale", str, len,
3670 (Py_ssize_t)error_pos,
3671 (Py_ssize_t)(error_pos+1),
3672 reason);
3673 Py_DECREF(reason);
3674 if (exc != NULL) {
3675 PyCodec_StrictErrors(exc);
3676 Py_XDECREF(exc);
3677 }
3678 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679}
3680
3681PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003682PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003683{
3684 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003685 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003686}
3687
3688
3689PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003690PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003691 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003692 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3693}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003694
Christian Heimes5894ba72007-11-04 11:43:14 +00003695PyObject*
3696PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3697{
Victor Stinner99b95382011-07-04 14:23:54 +02003698#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003699 return PyUnicode_DecodeMBCS(s, size, NULL);
3700#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003701 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003702#else
Victor Stinner793b5312011-04-27 00:24:21 +02003703 PyInterpreterState *interp = PyThreadState_GET()->interp;
3704 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3705 cannot use it to encode and decode filenames before it is loaded. Load
3706 the Python codec requires to encode at least its own filename. Use the C
3707 version of the locale codec until the codec registry is initialized and
3708 the Python codec is loaded.
3709
3710 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3711 cannot only rely on it: check also interp->fscodec_initialized for
3712 subinterpreters. */
3713 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003714 return PyUnicode_Decode(s, size,
3715 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003716 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003717 }
3718 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003719 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720 }
Victor Stinnerad158722010-10-27 00:25:46 +00003721#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003722}
3723
Martin v. Löwis011e8422009-05-05 04:43:17 +00003724
3725int
Antoine Pitrou13348842012-01-29 18:36:34 +01003726_PyUnicode_HasNULChars(PyObject* s)
3727{
3728 static PyObject *nul = NULL;
3729
3730 if (nul == NULL)
3731 nul = PyUnicode_FromStringAndSize("\0", 1);
3732 if (nul == NULL)
3733 return -1;
3734 return PyUnicode_Contains(s, nul);
3735}
3736
3737
3738int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003739PyUnicode_FSConverter(PyObject* arg, void* addr)
3740{
3741 PyObject *output = NULL;
3742 Py_ssize_t size;
3743 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003744 if (arg == NULL) {
3745 Py_DECREF(*(PyObject**)addr);
3746 return 1;
3747 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003748 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003749 output = arg;
3750 Py_INCREF(output);
3751 }
3752 else {
3753 arg = PyUnicode_FromObject(arg);
3754 if (!arg)
3755 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003756 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003757 Py_DECREF(arg);
3758 if (!output)
3759 return 0;
3760 if (!PyBytes_Check(output)) {
3761 Py_DECREF(output);
3762 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3763 return 0;
3764 }
3765 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003766 size = PyBytes_GET_SIZE(output);
3767 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003768 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003769 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770 Py_DECREF(output);
3771 return 0;
3772 }
3773 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003774 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003775}
3776
3777
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003778int
3779PyUnicode_FSDecoder(PyObject* arg, void* addr)
3780{
3781 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003782 if (arg == NULL) {
3783 Py_DECREF(*(PyObject**)addr);
3784 return 1;
3785 }
3786 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003787 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003789 output = arg;
3790 Py_INCREF(output);
3791 }
3792 else {
3793 arg = PyBytes_FromObject(arg);
3794 if (!arg)
3795 return 0;
3796 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3797 PyBytes_GET_SIZE(arg));
3798 Py_DECREF(arg);
3799 if (!output)
3800 return 0;
3801 if (!PyUnicode_Check(output)) {
3802 Py_DECREF(output);
3803 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3804 return 0;
3805 }
3806 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003807 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003808 Py_DECREF(output);
3809 return 0;
3810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003812 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003813 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3814 Py_DECREF(output);
3815 return 0;
3816 }
3817 *(PyObject**)addr = output;
3818 return Py_CLEANUP_SUPPORTED;
3819}
3820
3821
Martin v. Löwis5b222132007-06-10 09:51:05 +00003822char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003824{
Christian Heimesf3863112007-11-22 07:46:41 +00003825 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003827 if (!PyUnicode_Check(unicode)) {
3828 PyErr_BadArgument();
3829 return NULL;
3830 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003832 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003834 if (PyUnicode_UTF8(unicode) == NULL) {
3835 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3837 if (bytes == NULL)
3838 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3840 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 Py_DECREF(bytes);
3842 return NULL;
3843 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003844 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3845 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3846 PyBytes_AS_STRING(bytes),
3847 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 Py_DECREF(bytes);
3849 }
3850
3851 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003852 *psize = PyUnicode_UTF8_LENGTH(unicode);
3853 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003854}
3855
3856char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3860}
3861
3862#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003863static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864#endif
3865
3866
3867Py_UNICODE *
3868PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870 const unsigned char *one_byte;
3871#if SIZEOF_WCHAR_T == 4
3872 const Py_UCS2 *two_bytes;
3873#else
3874 const Py_UCS4 *four_bytes;
3875 const Py_UCS4 *ucs4_end;
3876 Py_ssize_t num_surrogates;
3877#endif
3878 wchar_t *w;
3879 wchar_t *wchar_end;
3880
3881 if (!PyUnicode_Check(unicode)) {
3882 PyErr_BadArgument();
3883 return NULL;
3884 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 assert(_PyUnicode_KIND(unicode) != 0);
3888 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889
3890#ifdef Py_DEBUG
3891 ++unicode_as_unicode_calls;
3892#endif
3893
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3897 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898 num_surrogates = 0;
3899
3900 for (; four_bytes < ucs4_end; ++four_bytes) {
3901 if (*four_bytes > 0xFFFF)
3902 ++num_surrogates;
3903 }
3904
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003905 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3906 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3907 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 PyErr_NoMemory();
3909 return NULL;
3910 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003911 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003913 w = _PyUnicode_WSTR(unicode);
3914 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3915 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3917 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003918 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003920 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3921 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 }
3923 else
3924 *w = *four_bytes;
3925
3926 if (w > wchar_end) {
3927 assert(0 && "Miscalculated string end");
3928 }
3929 }
3930 *w = 0;
3931#else
3932 /* sizeof(wchar_t) == 4 */
3933 Py_FatalError("Impossible unicode object state, wstr and str "
3934 "should share memory already.");
3935 return NULL;
3936#endif
3937 }
3938 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3940 (_PyUnicode_LENGTH(unicode) + 1));
3941 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 PyErr_NoMemory();
3943 return NULL;
3944 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3946 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3947 w = _PyUnicode_WSTR(unicode);
3948 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3951 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 for (; w < wchar_end; ++one_byte, ++w)
3953 *w = *one_byte;
3954 /* null-terminate the wstr */
3955 *w = 0;
3956 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003957 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 for (; w < wchar_end; ++two_bytes, ++w)
3961 *w = *two_bytes;
3962 /* null-terminate the wstr */
3963 *w = 0;
3964#else
3965 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003966 PyObject_FREE(_PyUnicode_WSTR(unicode));
3967 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 Py_FatalError("Impossible unicode object state, wstr "
3969 "and str should share memory already.");
3970 return NULL;
3971#endif
3972 }
3973 else {
3974 assert(0 && "This should never happen.");
3975 }
3976 }
3977 }
3978 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003979 *size = PyUnicode_WSTR_LENGTH(unicode);
3980 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003981}
3982
Alexander Belopolsky40018472011-02-26 01:02:56 +00003983Py_UNICODE *
3984PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987}
3988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989
Alexander Belopolsky40018472011-02-26 01:02:56 +00003990Py_ssize_t
3991PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992{
3993 if (!PyUnicode_Check(unicode)) {
3994 PyErr_BadArgument();
3995 goto onError;
3996 }
3997 return PyUnicode_GET_SIZE(unicode);
3998
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 return -1;
4001}
4002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003Py_ssize_t
4004PyUnicode_GetLength(PyObject *unicode)
4005{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004006 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 PyErr_BadArgument();
4008 return -1;
4009 }
4010
4011 return PyUnicode_GET_LENGTH(unicode);
4012}
4013
4014Py_UCS4
4015PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4016{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004017 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4018 PyErr_BadArgument();
4019 return (Py_UCS4)-1;
4020 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004021 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004022 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 return (Py_UCS4)-1;
4024 }
4025 return PyUnicode_READ_CHAR(unicode, index);
4026}
4027
4028int
4029PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4030{
4031 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004032 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 return -1;
4034 }
Victor Stinner488fa492011-12-12 00:01:39 +01004035 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004036 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004037 PyErr_SetString(PyExc_IndexError, "string index out of range");
4038 return -1;
4039 }
Victor Stinner488fa492011-12-12 00:01:39 +01004040 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004041 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004042 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4043 PyErr_SetString(PyExc_ValueError, "character out of range");
4044 return -1;
4045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4047 index, ch);
4048 return 0;
4049}
4050
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is a constant and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4056
Victor Stinner554f3f02010-06-16 23:33:54 +00004057/* create or adjust a UnicodeDecodeError */
4058static void
4059make_decode_exception(PyObject **exceptionObject,
4060 const char *encoding,
4061 const char *input, Py_ssize_t length,
4062 Py_ssize_t startpos, Py_ssize_t endpos,
4063 const char *reason)
4064{
4065 if (*exceptionObject == NULL) {
4066 *exceptionObject = PyUnicodeDecodeError_Create(
4067 encoding, input, length, startpos, endpos, reason);
4068 }
4069 else {
4070 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4071 goto onError;
4072 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4073 goto onError;
4074 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4075 goto onError;
4076 }
4077 return;
4078
4079onError:
4080 Py_DECREF(*exceptionObject);
4081 *exceptionObject = NULL;
4082}
4083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004084/* error handling callback helper:
4085 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004086 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 and adjust various state variables.
4088 return 0 on success, -1 on error
4089*/
4090
Alexander Belopolsky40018472011-02-26 01:02:56 +00004091static int
4092unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004093 const char *encoding, const char *reason,
4094 const char **input, const char **inend, Py_ssize_t *startinpos,
4095 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004096 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004098 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099
4100 PyObject *restuple = NULL;
4101 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004103 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004104 Py_ssize_t requiredsize;
4105 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 int res = -1;
4108
Victor Stinner596a6c42011-11-09 00:02:18 +01004109 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4110 outsize = PyUnicode_GET_LENGTH(*output);
4111 else
4112 outsize = _PyUnicode_WSTR_LENGTH(*output);
4113
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 *errorHandler = PyCodec_LookupError(errors);
4116 if (*errorHandler == NULL)
4117 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004118 }
4119
Victor Stinner554f3f02010-06-16 23:33:54 +00004120 make_decode_exception(exceptionObject,
4121 encoding,
4122 *input, *inend - *input,
4123 *startinpos, *endinpos,
4124 reason);
4125 if (*exceptionObject == NULL)
4126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127
4128 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4129 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004132 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 }
4135 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004137 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004138 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004139
4140 /* Copy back the bytes variables, which might have been modified by the
4141 callback */
4142 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4143 if (!inputobj)
4144 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004145 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004147 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004148 *input = PyBytes_AS_STRING(inputobj);
4149 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004151 /* we can DECREF safely, as the exception has another reference,
4152 so the object won't go away. */
4153 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004157 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4159 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161
Victor Stinner596a6c42011-11-09 00:02:18 +01004162 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4163 /* need more space? (at least enough for what we
4164 have+the replacement+the rest of the string (starting
4165 at the new input position), so we won't have to check space
4166 when there are no errors in the rest of the string) */
4167 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4168 requiredsize = *outpos + replen + insize-newpos;
4169 if (requiredsize > outsize) {
4170 if (requiredsize<2*outsize)
4171 requiredsize = 2*outsize;
4172 if (unicode_resize(output, requiredsize) < 0)
4173 goto onError;
4174 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004175 if (unicode_widen(output, *outpos,
4176 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004178 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004179 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004181 else {
4182 wchar_t *repwstr;
4183 Py_ssize_t repwlen;
4184 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4185 if (repwstr == NULL)
4186 goto onError;
4187 /* need more space? (at least enough for what we
4188 have+the replacement+the rest of the string (starting
4189 at the new input position), so we won't have to check space
4190 when there are no errors in the rest of the string) */
4191 requiredsize = *outpos + repwlen + insize-newpos;
4192 if (requiredsize > outsize) {
4193 if (requiredsize < 2*outsize)
4194 requiredsize = 2*outsize;
4195 if (unicode_resize(output, requiredsize) < 0)
4196 goto onError;
4197 }
4198 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4199 *outpos += repwlen;
4200 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004202 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 /* we made it! */
4205 res = 0;
4206
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 Py_XDECREF(restuple);
4209 return res;
4210}
4211
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004212/* --- UTF-7 Codec -------------------------------------------------------- */
4213
Antoine Pitrou244651a2009-05-04 18:56:13 +00004214/* See RFC2152 for details. We encode conservatively and decode liberally. */
4215
/* Three simple macros defining base-64.
   Note: IS_BASE64, FROM_BASE64 and DECODE_DIRECT expand their argument
   several times, so it must not have side effects. */

/* Is c a base-64 character? (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0-25, a-z -> 26-51, 0-9 -> 52-61, '+' -> 62, anything else -> 63;
   the result is only meaningful when IS_BASE64(c) holds) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4246
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127); each row below covers
 * 16 consecutive codes, named in the comment above it.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4280
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in the range 1..127 for the macro to be true (NUL and
 * non-ASCII characters are never direct); Set D characters are always
 * direct, whitespace only when directWS is true, Set O only when directO
 * is true.  c is expanded several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292
Alexander Belopolsky40018472011-02-26 01:02:56 +00004293PyObject *
4294PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004295 Py_ssize_t size,
4296 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004298 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4299}
4300
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   - input bytes and their count.
 *   errors    - error handler name, passed through to
 *               unicode_decode_call_errorhandler().
 *   consumed  - if non-NULL, receives the number of input bytes used.
 *               When the input ends inside a shift sequence, the output
 *               is truncated back to where that sequence began and
 *               *consumed is set to the offset of its '+'.  If NULL, an
 *               input ending inside a shift sequence with pending state
 *               is reported as "unterminated shift sequence".
 *
 * Returns a new reference to the decoded string, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;      /* input offset where the current error/shift began */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;          /* next write index in `unicode` */
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a '+...' base-64 run */
    Py_ssize_t shiftOutStart;   /* value of outpos when the current shift began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* accumulated base-64 bits not yet output */
    Py_UCS4 surrogate = 0;      /* pending high surrogate, or 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* `restart` is also jumped to from the end-of-input error path
           below, when the error handler moved `s` back before `e`. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the lone
                               high surrogate as-is, then handle outCh */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the shift sequence starts so errors and
               the `consumed` back-off can refer to it */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the error span is [startinpos, current position) */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* the handler may have repositioned s; resume decoding */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the over-allocated result to the characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4491
4492
Alexander Belopolsky40018472011-02-26 01:02:56 +00004493PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494_PyUnicode_EncodeUTF7(PyObject *str,
4495 int base64SetO,
4496 int base64WhiteSpace,
4497 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 int kind;
4500 void *data;
4501 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004502 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004505 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 unsigned int base64bits = 0;
4507 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 char * out;
4509 char * start;
4510
Benjamin Petersonbac79492012-01-14 13:34:47 -05004511 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004512 return NULL;
4513 kind = PyUnicode_KIND(str);
4514 data = PyUnicode_DATA(str);
4515 len = PyUnicode_GET_LENGTH(str);
4516
4517 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004520 /* It might be possible to tighten this worst case */
4521 allocated = 8 * len;
4522 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004523 return PyErr_NoMemory();
4524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 if (v == NULL)
4527 return NULL;
4528
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004529 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004530 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004531 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 if (inShift) {
4534 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4535 /* shifting out */
4536 if (base64bits) { /* output remaining bits */
4537 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4538 base64buffer = 0;
4539 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 }
4541 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 /* Characters not in the BASE64 set implicitly unshift the sequence
4543 so no '-' is required, except if the character is itself a '-' */
4544 if (IS_BASE64(ch) || ch == '-') {
4545 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 *out++ = (char) ch;
4548 }
4549 else {
4550 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004551 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004552 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 else { /* not in a shift sequence */
4554 if (ch == '+') {
4555 *out++ = '+';
4556 *out++ = '-';
4557 }
4558 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4559 *out++ = (char) ch;
4560 }
4561 else {
4562 *out++ = '+';
4563 inShift = 1;
4564 goto encode_char;
4565 }
4566 }
4567 continue;
4568encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004570 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004571
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 /* code first surrogate */
4573 base64bits += 16;
4574 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4575 while (base64bits >= 6) {
4576 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4577 base64bits -= 6;
4578 }
4579 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004580 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 base64bits += 16;
4583 base64buffer = (base64buffer << 16) | ch;
4584 while (base64bits >= 6) {
4585 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4586 base64bits -= 6;
4587 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004588 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 if (base64bits)
4590 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4591 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004593 if (_PyBytes_Resize(&v, out - start) < 0)
4594 return NULL;
4595 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004596}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004597PyObject *
4598PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4599 Py_ssize_t size,
4600 int base64SetO,
4601 int base64WhiteSpace,
4602 const char *errors)
4603{
4604 PyObject *result;
4605 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4606 if (tmp == NULL)
4607 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004608 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004609 base64WhiteSpace, errors);
4610 Py_DECREF(tmp);
4611 return result;
4612}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
/* The UTF-7 helper macros are local to the codec above; drop them so
   they do not leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620/* --- UTF-8 Codec -------------------------------------------------------- */
4621
Alexander Belopolsky40018472011-02-26 01:02:56 +00004622PyObject *
4623PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004624 Py_ssize_t size,
4625 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626{
Walter Dörwald69652032004-09-07 20:24:22 +00004627 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4628}
4629
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004630#include "stringlib/asciilib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004634#include "stringlib/ucs1lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
4638#include "stringlib/ucs2lib.h"
4639#include "stringlib/codecs.h"
4640#include "stringlib/undef.h"
4641
4642#include "stringlib/ucs4lib.h"
4643#include "stringlib/codecs.h"
4644#include "stringlib/undef.h"
4645
Antoine Pitrouab868312009-01-10 15:40:25 +00004646/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4647#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4648
4649/* Mask to quickly check whether a C 'long' contains a
4650 non-ASCII, UTF8-encoded char. */
4651#if (SIZEOF_LONG == 8)
4652# define ASCII_CHAR_MASK 0x8080808080808080L
4653#elif (SIZEOF_LONG == 4)
4654# define ASCII_CHAR_MASK 0x80808080L
4655#else
4656# error C 'long' size should be either 4 or 8!
4657#endif
4658
/* Copy the leading run of ASCII bytes (top bit clear) from [start, end)
   into `dest` and return how many bytes were copied.  Scanning stops at
   the first byte with the 0x80 bit set, or at `end`.

   Where possible the input is examined one C 'long' at a time, using
   ASCII_CHAR_MASK to test all bytes of the word at once. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* `end` rounded down to a 'long' boundary: word reads stop here */
    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);

#if SIZEOF_LONG <= SIZEOF_VOID_P
    /* This path stores whole words into dest, so dest must be aligned. */
    assert(!((size_t) dest & LONG_PTR_MASK));
    if (!((size_t) p & LONG_PTR_MASK)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help register allocation */
        register const char *_p = p;
        register Py_UCS1 * q = dest;
        /* source and destination are both aligned: copy word by word
           while every byte in the word is ASCII */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte (tail, or the word containing non-ASCII) */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* General path: only scan here, then copy the ASCII prefix in one
       memcpy at the end. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (!((size_t) p & LONG_PTR_MASK)) {
            /* Help register allocation */
            register const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004713
/* Decode `size` bytes of UTF-8 at `s` into a new Unicode object.

   errors    - error handler name, passed to
               unicode_decode_call_errorhandler().
   consumed  - if non-NULL, receives the number of input bytes used; in
               that case an incomplete sequence at the end of the input
               stops decoding instead of being reported as an error.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyObject *unicode;
    const char *starts = s;
    const char *end = s + size;
    Py_ssize_t outpos;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Empty input: hand out the shared empty string. */
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Allocate one character per input byte, starting as ASCII. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;

    /* Copy the leading ASCII run directly, then decode the remainder. */
    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
    s += outpos;
    while (s < end) {
        Py_UCS4 ch;
        /* The kind is re-read on every iteration and the matching
           decoder is chosen; NOTE(review): presumably unicode_putchar()
           below can change the kind of `unicode` -- confirm. */
        int kind = PyUnicode_KIND(unicode);
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(unicode))
                ch = asciilib_utf8_decode(&s, end,
                        PyUnicode_1BYTE_DATA(unicode), &outpos);
            else
                ch = ucs1lib_utf8_decode(&s, end,
                        PyUnicode_1BYTE_DATA(unicode), &outpos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end,
                    PyUnicode_2BYTE_DATA(unicode), &outpos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end,
                    PyUnicode_4BYTE_DATA(unicode), &outpos);
        }

        /* Values 0, 1 and 2 are status codes from the decoder; any other
           value is treated as a decoded character to append. */
        switch (ch) {
        case 0:
            /* finished, or stopped on an incomplete trailing sequence */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            /* extend the error span over following continuation bytes */
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            /* extend the error span over following continuation bytes */
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

End:
    /* trim the over-allocated result to the characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(unicode);
    return NULL;
}
4824
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a newly allocated, NUL-terminated
   wchar_t buffer obtained from PyMem_Malloc(); the caller owns the
   buffer.  Bytes that cannot be decoded are mapped to 0xDC00 + byte.
   Returns NULL if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character it could not store
               in the output buffer itself. */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* ch is split into a surrogate pair below, so it must be a
               code point outside the BMP (not itself a surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* small return values are status codes: 0 at end of input
               means we are done, anything else is a decoding error */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880/* Primary internal function which creates utf8 encoded bytes objects.
4881
4882 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004883 and allocate exactly as much space needed at the end. Else allocate the
4884 maximum possible needed (4 result bytes per Unicode character), and return
4885 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004886*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004887PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004888_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889{
Victor Stinner6099a032011-12-18 14:22:26 +01004890 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891 void *data;
4892 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004894 if (!PyUnicode_Check(unicode)) {
4895 PyErr_BadArgument();
4896 return NULL;
4897 }
4898
4899 if (PyUnicode_READY(unicode) == -1)
4900 return NULL;
4901
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004902 if (PyUnicode_UTF8(unicode))
4903 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4904 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905
4906 kind = PyUnicode_KIND(unicode);
4907 data = PyUnicode_DATA(unicode);
4908 size = PyUnicode_GET_LENGTH(unicode);
4909
Benjamin Petersonead6b532011-12-20 17:23:42 -06004910 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004911 default:
4912 assert(0);
4913 case PyUnicode_1BYTE_KIND:
4914 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4915 assert(!PyUnicode_IS_ASCII(unicode));
4916 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4917 case PyUnicode_2BYTE_KIND:
4918 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4919 case PyUnicode_4BYTE_KIND:
4920 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922}
4923
Alexander Belopolsky40018472011-02-26 01:02:56 +00004924PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4926 Py_ssize_t size,
4927 const char *errors)
4928{
4929 PyObject *v, *unicode;
4930
4931 unicode = PyUnicode_FromUnicode(s, size);
4932 if (unicode == NULL)
4933 return NULL;
4934 v = _PyUnicode_AsUTF8String(unicode, errors);
4935 Py_DECREF(unicode);
4936 return v;
4937}
4938
4939PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004940PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004941{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004942 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943}
4944
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945/* --- UTF-32 Codec ------------------------------------------------------- */
4946
4947PyObject *
4948PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 Py_ssize_t size,
4950 const char *errors,
4951 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952{
4953 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4954}
4955
4956PyObject *
4957PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 Py_ssize_t size,
4959 const char *errors,
4960 int *byteorder,
4961 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962{
4963 const char *starts = s;
4964 Py_ssize_t startinpos;
4965 Py_ssize_t endinpos;
4966 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004967 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004968 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 int bo = 0; /* assume native ordering by default */
4970 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971 /* Offsets from q for retrieving bytes in the right order. */
4972#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4973 int iorder[] = {0, 1, 2, 3};
4974#else
4975 int iorder[] = {3, 2, 1, 0};
4976#endif
4977 PyObject *errorHandler = NULL;
4978 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004979
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 q = (unsigned char *)s;
4981 e = q + size;
4982
4983 if (byteorder)
4984 bo = *byteorder;
4985
4986 /* Check for BOM marks (U+FEFF) in the input and adjust current
4987 byte order setting accordingly. In native mode, the leading BOM
4988 mark is skipped, in all other modes, it is copied to the output
4989 stream as-is (giving a ZWNBSP character). */
4990 if (bo == 0) {
4991 if (size >= 4) {
4992 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 if (bom == 0x0000FEFF) {
4996 q += 4;
4997 bo = -1;
4998 }
4999 else if (bom == 0xFFFE0000) {
5000 q += 4;
5001 bo = 1;
5002 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005003#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 if (bom == 0x0000FEFF) {
5005 q += 4;
5006 bo = 1;
5007 }
5008 else if (bom == 0xFFFE0000) {
5009 q += 4;
5010 bo = -1;
5011 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014 }
5015
5016 if (bo == -1) {
5017 /* force LE */
5018 iorder[0] = 0;
5019 iorder[1] = 1;
5020 iorder[2] = 2;
5021 iorder[3] = 3;
5022 }
5023 else if (bo == 1) {
5024 /* force BE */
5025 iorder[0] = 3;
5026 iorder[1] = 2;
5027 iorder[2] = 1;
5028 iorder[3] = 0;
5029 }
5030
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005031 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005032 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005033 if (!unicode)
5034 return NULL;
5035 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005036 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005037 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005038
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 Py_UCS4 ch;
5041 /* remaining bytes at the end? (size should be divisible by 4) */
5042 if (e-q<4) {
5043 if (consumed)
5044 break;
5045 errmsg = "truncated data";
5046 startinpos = ((const char *)q)-starts;
5047 endinpos = ((const char *)e)-starts;
5048 goto utf32Error;
5049 /* The remaining input chars are ignored if the callback
5050 chooses to skip the input */
5051 }
5052 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5053 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 if (ch >= 0x110000)
5056 {
5057 errmsg = "codepoint not in range(0x110000)";
5058 startinpos = ((const char *)q)-starts;
5059 endinpos = startinpos+4;
5060 goto utf32Error;
5061 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005062 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5063 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 q += 4;
5065 continue;
5066 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 if (unicode_decode_call_errorhandler(
5068 errors, &errorHandler,
5069 "utf32", errmsg,
5070 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005071 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 }
5074
5075 if (byteorder)
5076 *byteorder = bo;
5077
5078 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080
5081 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005082 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 goto onError;
5084
5085 Py_XDECREF(errorHandler);
5086 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005087 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 Py_DECREF(unicode);
5091 Py_XDECREF(errorHandler);
5092 Py_XDECREF(exc);
5093 return NULL;
5094}
5095
5096PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005097_PyUnicode_EncodeUTF32(PyObject *str,
5098 const char *errors,
5099 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005101 int kind;
5102 void *data;
5103 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005104 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005106 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 /* Offsets from p for storing byte pairs in the right order. */
5108#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5109 int iorder[] = {0, 1, 2, 3};
5110#else
5111 int iorder[] = {3, 2, 1, 0};
5112#endif
5113
Benjamin Peterson29060642009-01-31 22:14:21 +00005114#define STORECHAR(CH) \
5115 do { \
5116 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5117 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5118 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5119 p[iorder[0]] = (CH) & 0xff; \
5120 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121 } while(0)
5122
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005123 if (!PyUnicode_Check(str)) {
5124 PyErr_BadArgument();
5125 return NULL;
5126 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005127 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005128 return NULL;
5129 kind = PyUnicode_KIND(str);
5130 data = PyUnicode_DATA(str);
5131 len = PyUnicode_GET_LENGTH(str);
5132
5133 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005134 bytesize = nsize * 4;
5135 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005137 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (v == NULL)
5139 return NULL;
5140
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005141 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005144 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005145 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146
5147 if (byteorder == -1) {
5148 /* force LE */
5149 iorder[0] = 0;
5150 iorder[1] = 1;
5151 iorder[2] = 2;
5152 iorder[3] = 3;
5153 }
5154 else if (byteorder == 1) {
5155 /* force BE */
5156 iorder[0] = 3;
5157 iorder[1] = 2;
5158 iorder[2] = 1;
5159 iorder[3] = 0;
5160 }
5161
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005162 for (i = 0; i < len; i++)
5163 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005164
5165 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005166 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005167#undef STORECHAR
5168}
5169
Alexander Belopolsky40018472011-02-26 01:02:56 +00005170PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005171PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5172 Py_ssize_t size,
5173 const char *errors,
5174 int byteorder)
5175{
5176 PyObject *result;
5177 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5178 if (tmp == NULL)
5179 return NULL;
5180 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5181 Py_DECREF(tmp);
5182 return result;
5183}
5184
5185PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005187{
Victor Stinnerb960b342011-11-20 19:12:52 +01005188 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005189}
5190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191/* --- UTF-16 Codec ------------------------------------------------------- */
5192
Tim Peters772747b2001-08-09 22:21:55 +00005193PyObject *
5194PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 Py_ssize_t size,
5196 const char *errors,
5197 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198{
Walter Dörwald69652032004-09-07 20:24:22 +00005199 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5200}
5201
5202PyObject *
5203PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 Py_ssize_t size,
5205 const char *errors,
5206 int *byteorder,
5207 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005210 Py_ssize_t startinpos;
5211 Py_ssize_t endinpos;
5212 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005213 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005214 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005215 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005216 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005217 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 PyObject *errorHandler = NULL;
5219 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
Tim Peters772747b2001-08-09 22:21:55 +00005221 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005222 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
5224 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005225 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227 /* Check for BOM marks (U+FEFF) in the input and adjust current
5228 byte order setting accordingly. In native mode, the leading BOM
5229 mark is skipped, in all other modes, it is copied to the output
5230 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005231 if (bo == 0 && size >= 2) {
5232 const Py_UCS4 bom = (q[1] << 8) | q[0];
5233 if (bom == 0xFEFF) {
5234 q += 2;
5235 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237 else if (bom == 0xFFFE) {
5238 q += 2;
5239 bo = 1;
5240 }
5241 if (byteorder)
5242 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244
Antoine Pitrou63065d72012-05-15 23:48:04 +02005245 if (q == e) {
5246 if (consumed)
5247 *consumed = size;
5248 Py_INCREF(unicode_empty);
5249 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005250 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005251
Antoine Pitrouab868312009-01-10 15:40:25 +00005252#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005253 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005254#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005255 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005256#endif
Tim Peters772747b2001-08-09 22:21:55 +00005257
Antoine Pitrou63065d72012-05-15 23:48:04 +02005258 /* Note: size will always be longer than the resulting Unicode
5259 character count */
5260 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5261 if (!unicode)
5262 return NULL;
5263
5264 outpos = 0;
5265 while (1) {
5266 Py_UCS4 ch = 0;
5267 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005268 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005269 if (kind == PyUnicode_1BYTE_KIND) {
5270 if (PyUnicode_IS_ASCII(unicode))
5271 ch = asciilib_utf16_decode(&q, e,
5272 PyUnicode_1BYTE_DATA(unicode), &outpos,
5273 native_ordering);
5274 else
5275 ch = ucs1lib_utf16_decode(&q, e,
5276 PyUnicode_1BYTE_DATA(unicode), &outpos,
5277 native_ordering);
5278 } else if (kind == PyUnicode_2BYTE_KIND) {
5279 ch = ucs2lib_utf16_decode(&q, e,
5280 PyUnicode_2BYTE_DATA(unicode), &outpos,
5281 native_ordering);
5282 } else {
5283 assert(kind == PyUnicode_4BYTE_KIND);
5284 ch = ucs4lib_utf16_decode(&q, e,
5285 PyUnicode_4BYTE_DATA(unicode), &outpos,
5286 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005287 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005288 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289
Antoine Pitrou63065d72012-05-15 23:48:04 +02005290 switch (ch)
5291 {
5292 case 0:
5293 /* remaining byte at the end? (size should be even) */
5294 if (q == e || consumed)
5295 goto End;
5296 errmsg = "truncated data";
5297 startinpos = ((const char *)q) - starts;
5298 endinpos = ((const char *)e) - starts;
5299 break;
5300 /* The remaining input chars are ignored if the callback
5301 chooses to skip the input */
5302 case 1:
5303 errmsg = "unexpected end of data";
5304 startinpos = ((const char *)q) - 2 - starts;
5305 endinpos = ((const char *)e) - starts;
5306 break;
5307 case 2:
5308 errmsg = "illegal encoding";
5309 startinpos = ((const char *)q) - 2 - starts;
5310 endinpos = startinpos + 2;
5311 break;
5312 case 3:
5313 errmsg = "illegal UTF-16 surrogate";
5314 startinpos = ((const char *)q) - 4 - starts;
5315 endinpos = startinpos + 2;
5316 break;
5317 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005318 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5319 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005320 continue;
5321 }
5322
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005324 errors,
5325 &errorHandler,
5326 "utf16", errmsg,
5327 &starts,
5328 (const char **)&e,
5329 &startinpos,
5330 &endinpos,
5331 &exc,
5332 (const char **)&q,
5333 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005334 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 }
5337
Antoine Pitrou63065d72012-05-15 23:48:04 +02005338End:
Walter Dörwald69652032004-09-07 20:24:22 +00005339 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005341
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005343 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 goto onError;
5345
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 Py_XDECREF(errorHandler);
5347 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005348 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 Py_XDECREF(errorHandler);
5353 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 return NULL;
5355}
5356
Tim Peters772747b2001-08-09 22:21:55 +00005357PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005358_PyUnicode_EncodeUTF16(PyObject *str,
5359 const char *errors,
5360 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005362 enum PyUnicode_Kind kind;
5363 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005365 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 unsigned short *out;
5367 Py_ssize_t bytesize;
5368 Py_ssize_t pairs;
5369#ifdef WORDS_BIGENDIAN
5370 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005371#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005372 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005373#endif
5374
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375 if (!PyUnicode_Check(str)) {
5376 PyErr_BadArgument();
5377 return NULL;
5378 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005379 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005380 return NULL;
5381 kind = PyUnicode_KIND(str);
5382 data = PyUnicode_DATA(str);
5383 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005384
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005386 if (kind == PyUnicode_4BYTE_KIND) {
5387 const Py_UCS4 *in = (const Py_UCS4 *)data;
5388 const Py_UCS4 *end = in + len;
5389 while (in < end)
5390 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005391 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005392 }
5393 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005395 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005396 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 if (v == NULL)
5398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005400 /* output buffer is 2-bytes aligned */
5401 assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0);
5402 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005404 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005405 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005406 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005407
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005408 switch (kind) {
5409 case PyUnicode_1BYTE_KIND: {
5410 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5411 break;
Tim Peters772747b2001-08-09 22:21:55 +00005412 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005413 case PyUnicode_2BYTE_KIND: {
5414 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5415 break;
Tim Peters772747b2001-08-09 22:21:55 +00005416 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005417 case PyUnicode_4BYTE_KIND: {
5418 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5419 break;
5420 }
5421 default:
5422 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005423 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005424
5425 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005426 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427}
5428
Alexander Belopolsky40018472011-02-26 01:02:56 +00005429PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005430PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5431 Py_ssize_t size,
5432 const char *errors,
5433 int byteorder)
5434{
5435 PyObject *result;
5436 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5437 if (tmp == NULL)
5438 return NULL;
5439 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5440 Py_DECREF(tmp);
5441 return result;
5442}
5443
5444PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005445PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005447 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448}
5449
5450/* --- Unicode Escape Codec ----------------------------------------------- */
5451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5453 if all the escapes in the string make it still a valid ASCII string.
5454 Returns -1 if any escapes were found which cause the string to
5455 pop out of ASCII range. Otherwise returns the length of the
5456 required buffer to hold the string.
5457 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005458static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005459length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5460{
5461 const unsigned char *p = (const unsigned char *)s;
5462 const unsigned char *end = p + size;
5463 Py_ssize_t length = 0;
5464
5465 if (size < 0)
5466 return -1;
5467
5468 for (; p < end; ++p) {
5469 if (*p > 127) {
5470 /* Non-ASCII */
5471 return -1;
5472 }
5473 else if (*p != '\\') {
5474 /* Normal character */
5475 ++length;
5476 }
5477 else {
5478 /* Backslash-escape, check next char */
5479 ++p;
5480 /* Escape sequence reaches till end of string or
5481 non-ASCII follow-up. */
5482 if (p >= end || *p > 127)
5483 return -1;
5484 switch (*p) {
5485 case '\n':
5486 /* backslash + \n result in zero characters */
5487 break;
5488 case '\\': case '\'': case '\"':
5489 case 'b': case 'f': case 't':
5490 case 'n': case 'r': case 'v': case 'a':
5491 ++length;
5492 break;
5493 case '0': case '1': case '2': case '3':
5494 case '4': case '5': case '6': case '7':
5495 case 'x': case 'u': case 'U': case 'N':
5496 /* these do not guarantee ASCII characters */
5497 return -1;
5498 default:
5499 /* count the backslash + the other character */
5500 length += 2;
5501 }
5502 }
5503 }
5504 return length;
5505}
5506
Fredrik Lundh06d12682001-01-24 07:59:11 +00005507static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005508
Alexander Belopolsky40018472011-02-26 01:02:56 +00005509PyObject *
5510PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005511 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005512 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005515 Py_ssize_t startinpos;
5516 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005518 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005520 char* message;
5521 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005522 PyObject *errorHandler = NULL;
5523 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005526
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005527 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528
5529 /* After length_of_escaped_ascii_string() there are two alternatives,
5530 either the string is pure ASCII with named escapes like \n, etc.
5531 and we determined it's exact size (common case)
5532 or it contains \x, \u, ... escape sequences. then we create a
5533 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005534 if (len >= 0) {
5535 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 if (!v)
5537 goto onError;
5538 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005539 }
5540 else {
5541 /* Escaped strings will always be longer than the resulting
5542 Unicode string, so we start with size here and then reduce the
5543 length after conversion to the true value.
5544 (but if the error callback returns a long replacement string
5545 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547 if (!v)
5548 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005549 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 }
5551
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005553 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005554 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005556
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 while (s < end) {
5558 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005559 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005562 /* The only case in which i == ascii_length is a backslash
5563 followed by a newline. */
5564 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 /* Non-escape characters are interpreted as Unicode ordinals */
5567 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005568 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5569 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 continue;
5571 }
5572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 /* \ - Escapes */
5575 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005576 c = *s++;
5577 if (s > end)
5578 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580 /* The only case in which i == ascii_length is a backslash
5581 followed by a newline. */
5582 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005584 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587#define WRITECHAR(ch) \
5588 do { \
5589 if (unicode_putchar(&v, &i, ch) < 0) \
5590 goto onError; \
5591 }while(0)
5592
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005594 case '\\': WRITECHAR('\\'); break;
5595 case '\'': WRITECHAR('\''); break;
5596 case '\"': WRITECHAR('\"'); break;
5597 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599 case 'f': WRITECHAR('\014'); break;
5600 case 't': WRITECHAR('\t'); break;
5601 case 'n': WRITECHAR('\n'); break;
5602 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005604 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005605 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005606 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 case '0': case '1': case '2': case '3':
5610 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005611 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005612 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005613 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005614 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005615 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005617 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 break;
5619
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 /* hex escapes */
5621 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005623 digits = 2;
5624 message = "truncated \\xXX escape";
5625 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005629 digits = 4;
5630 message = "truncated \\uXXXX escape";
5631 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005634 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005635 digits = 8;
5636 message = "truncated \\UXXXXXXXX escape";
5637 hexescape:
5638 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 if (s+digits>end) {
5640 endinpos = size;
5641 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005642 errors, &errorHandler,
5643 "unicodeescape", "end of string in escape sequence",
5644 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 goto onError;
5647 goto nextByte;
5648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 for (j = 0; j < digits; ++j) {
5650 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005651 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005652 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 errors, &errorHandler,
5655 "unicodeescape", message,
5656 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005657 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005658 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005659 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005661 }
5662 chr = (chr<<4) & ~0xF;
5663 if (c >= '0' && c <= '9')
5664 chr += c - '0';
5665 else if (c >= 'a' && c <= 'f')
5666 chr += 10 + c - 'a';
5667 else
5668 chr += 10 + c - 'A';
5669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005671 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 /* _decoding_error will have already written into the
5673 target buffer. */
5674 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005675 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005676 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005677 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005678 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005679 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 errors, &errorHandler,
5683 "unicodeescape", "illegal Unicode character",
5684 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005686 goto onError;
5687 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005688 break;
5689
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 case 'N':
5692 message = "malformed \\N character escape";
5693 if (ucnhash_CAPI == NULL) {
5694 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5696 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005697 if (ucnhash_CAPI == NULL)
5698 goto ucnhashError;
5699 }
5700 if (*s == '{') {
5701 const char *start = s+1;
5702 /* look for the closing brace */
5703 while (*s != '}' && s < end)
5704 s++;
5705 if (s > start && s < end && *s == '}') {
5706 /* found a name. look it up in the unicode database */
5707 message = "unknown Unicode character name";
5708 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005710 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005711 goto store;
5712 }
5713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 errors, &errorHandler,
5717 "unicodeescape", message,
5718 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005720 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005721 break;
5722
5723 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005724 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 message = "\\ at end of string";
5726 s--;
5727 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 errors, &errorHandler,
5730 "unicodeescape", message,
5731 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005732 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005733 goto onError;
5734 }
5735 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 WRITECHAR('\\');
5737 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005738 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005739 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005744#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745
Victor Stinner16e6a802011-12-12 13:24:15 +01005746 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005747 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005748 Py_XDECREF(errorHandler);
5749 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005750 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005753 PyErr_SetString(
5754 PyExc_UnicodeError,
5755 "\\N escapes not supported (can't load unicodedata module)"
5756 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005757 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_XDECREF(errorHandler);
5759 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005760 return NULL;
5761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 Py_XDECREF(errorHandler);
5765 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
5767}
5768
5769/* Return a Unicode-Escape string version of the Unicode object.
5770
5771 If quotes is true, the string is enclosed in u"" or u'' quotes as
5772 appropriate.
5773
5774*/
5775
Alexander Belopolsky40018472011-02-26 01:02:56 +00005776PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005777PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005779 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005780 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005782 int kind;
5783 void *data;
5784 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
Thomas Wouters89f507f2006-12-13 04:49:30 +00005786 /* Initial allocation is based on the longest-possible unichr
5787 escape.
5788
5789 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5790 unichr, so in this case it's the longest unichr escape. In
5791 narrow (UTF-16) builds this is five chars per source unichr
5792 since there are two unichrs in the surrogate pair, so in narrow
5793 (UTF-16) builds it's not the longest unichr escape.
5794
5795 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5796 so in the narrow (UTF-16) build case it's the longest unichr
5797 escape.
5798 */
5799
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005800 if (!PyUnicode_Check(unicode)) {
5801 PyErr_BadArgument();
5802 return NULL;
5803 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005804 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005805 return NULL;
5806 len = PyUnicode_GET_LENGTH(unicode);
5807 kind = PyUnicode_KIND(unicode);
5808 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005809 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005810 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5811 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5812 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5813 }
5814
5815 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 return PyBytes_FromStringAndSize(NULL, 0);
5817
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005818 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005820
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005821 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005823 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 if (repr == NULL)
5826 return NULL;
5827
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005830 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005831 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005832
Walter Dörwald79e913e2007-05-12 11:08:06 +00005833 /* Escape backslashes */
5834 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 *p++ = '\\';
5836 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005837 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005838 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005839
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005840 /* Map 21-bit characters to '\U00xxxxxx' */
5841 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005842 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005843 *p++ = '\\';
5844 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005845 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5846 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5847 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5848 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5849 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5850 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5851 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5852 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005854 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005855
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005857 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 *p++ = '\\';
5859 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005860 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5861 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5862 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5863 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005865
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005866 /* Map special whitespace to '\t', \n', '\r' */
5867 else if (ch == '\t') {
5868 *p++ = '\\';
5869 *p++ = 't';
5870 }
5871 else if (ch == '\n') {
5872 *p++ = '\\';
5873 *p++ = 'n';
5874 }
5875 else if (ch == '\r') {
5876 *p++ = '\\';
5877 *p++ = 'r';
5878 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005879
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005880 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005881 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005883 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005884 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5885 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005886 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005887
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 /* Copy everything else as-is */
5889 else
5890 *p++ = (char) ch;
5891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 assert(p - PyBytes_AS_STRING(repr) > 0);
5894 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5895 return NULL;
5896 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897}
5898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005903 PyObject *result;
5904 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5905 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 result = PyUnicode_AsUnicodeEscapeString(tmp);
5908 Py_DECREF(tmp);
5909 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910}
5911
5912/* --- Raw Unicode Escape Codec ------------------------------------------- */
5913
Alexander Belopolsky40018472011-02-26 01:02:56 +00005914PyObject *
5915PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005916 Py_ssize_t size,
5917 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920 Py_ssize_t startinpos;
5921 Py_ssize_t endinpos;
5922 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005923 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 const char *end;
5925 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 PyObject *errorHandler = NULL;
5927 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 /* Escaped strings will always be longer than the resulting
5930 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 length after conversion to the true value. (But decoding error
5932 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005933 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005937 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005938 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 end = s + size;
5940 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 unsigned char c;
5942 Py_UCS4 x;
5943 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005944 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 /* Non-escape characters are interpreted as Unicode ordinals */
5947 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005948 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5949 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005951 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 startinpos = s-starts;
5953
5954 /* \u-escapes are only interpreted iff the number of leading
5955 backslashes if odd */
5956 bs = s;
5957 for (;s < end;) {
5958 if (*s != '\\')
5959 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005960 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5961 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 }
5963 if (((s - bs) & 1) == 0 ||
5964 s >= end ||
5965 (*s != 'u' && *s != 'U')) {
5966 continue;
5967 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 count = *s=='u' ? 4 : 8;
5970 s++;
5971
5972 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 for (x = 0, i = 0; i < count; ++i, ++s) {
5974 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005975 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 endinpos = s-starts;
5977 if (unicode_decode_call_errorhandler(
5978 errors, &errorHandler,
5979 "rawunicodeescape", "truncated \\uXXXX",
5980 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005981 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 goto onError;
5983 goto nextByte;
5984 }
5985 x = (x<<4) & ~0xF;
5986 if (c >= '0' && c <= '9')
5987 x += c - '0';
5988 else if (c >= 'a' && c <= 'f')
5989 x += 10 + c - 'a';
5990 else
5991 x += 10 + c - 'A';
5992 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005993 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005994 if (unicode_putchar(&v, &outpos, x) < 0)
5995 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005996 } else {
5997 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005998 if (unicode_decode_call_errorhandler(
5999 errors, &errorHandler,
6000 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006002 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006004 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 nextByte:
6006 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006008 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 Py_XDECREF(errorHandler);
6011 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006012 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 Py_XDECREF(errorHandler);
6017 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 return NULL;
6019}
6020
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006021
Alexander Belopolsky40018472011-02-26 01:02:56 +00006022PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006025 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 char *p;
6027 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028 Py_ssize_t expandsize, pos;
6029 int kind;
6030 void *data;
6031 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033 if (!PyUnicode_Check(unicode)) {
6034 PyErr_BadArgument();
6035 return NULL;
6036 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006037 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 return NULL;
6039 kind = PyUnicode_KIND(unicode);
6040 data = PyUnicode_DATA(unicode);
6041 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006042 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6043 bytes, and 1 byte characters 4. */
6044 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006045
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006046 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006048
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006049 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 if (repr == NULL)
6051 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006053 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006055 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 for (pos = 0; pos < len; pos++) {
6057 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 /* Map 32-bit characters to '\Uxxxxxxxx' */
6059 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006060 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006061 *p++ = '\\';
6062 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006063 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6064 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6065 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6066 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6067 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6068 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6069 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6070 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006071 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006073 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 *p++ = '\\';
6075 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006076 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6077 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6078 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6079 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 /* Copy everything else as-is */
6082 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 *p++ = (char) ch;
6084 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006085
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006086 assert(p > q);
6087 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006088 return NULL;
6089 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090}
6091
Alexander Belopolsky40018472011-02-26 01:02:56 +00006092PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006093PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6094 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006096 PyObject *result;
6097 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6098 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006099 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6101 Py_DECREF(tmp);
6102 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103}
6104
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105/* --- Unicode Internal Codec ------------------------------------------- */
6106
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107PyObject *
6108_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006109 Py_ssize_t size,
6110 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006111{
6112 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006113 Py_ssize_t startinpos;
6114 Py_ssize_t endinpos;
6115 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006116 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006117 const char *end;
6118 const char *reason;
6119 PyObject *errorHandler = NULL;
6120 PyObject *exc = NULL;
6121
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006122 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006123 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006124 1))
6125 return NULL;
6126
Thomas Wouters89f507f2006-12-13 04:49:30 +00006127 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006128 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006129 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006131 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006132 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006133 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006134 end = s + size;
6135
6136 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006137 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006138 Py_UCS4 ch;
6139 /* We copy the raw representation one byte at a time because the
6140 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006141 ((char *) &uch)[0] = s[0];
6142 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006143#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006144 ((char *) &uch)[2] = s[2];
6145 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006146#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006147 ch = uch;
6148
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006149 /* We have to sanity check the raw data, otherwise doom looms for
6150 some malformed UCS-4 data. */
6151 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006152#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006153 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006154#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006155 end-s < Py_UNICODE_SIZE
6156 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006158 startinpos = s - starts;
6159 if (end-s < Py_UNICODE_SIZE) {
6160 endinpos = end-starts;
6161 reason = "truncated input";
6162 }
6163 else {
6164 endinpos = s - starts + Py_UNICODE_SIZE;
6165 reason = "illegal code point (> 0x10FFFF)";
6166 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006167 if (unicode_decode_call_errorhandler(
6168 errors, &errorHandler,
6169 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006170 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006171 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006172 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006173 continue;
6174 }
6175
6176 s += Py_UNICODE_SIZE;
6177#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006178 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006179 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006180 Py_UNICODE uch2;
6181 ((char *) &uch2)[0] = s[0];
6182 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006183 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006184 {
Victor Stinner551ac952011-11-29 22:58:13 +01006185 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006186 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006187 }
6188 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006189#endif
6190
6191 if (unicode_putchar(&v, &outpos, ch) < 0)
6192 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006193 }
6194
Victor Stinner16e6a802011-12-12 13:24:15 +01006195 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006196 goto onError;
6197 Py_XDECREF(errorHandler);
6198 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006199 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006200
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006202 Py_XDECREF(v);
6203 Py_XDECREF(errorHandler);
6204 Py_XDECREF(exc);
6205 return NULL;
6206}
6207
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208/* --- Latin-1 Codec ------------------------------------------------------ */
6209
Alexander Belopolsky40018472011-02-26 01:02:56 +00006210PyObject *
6211PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006212 Py_ssize_t size,
6213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006216 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217}
6218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006219/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220static void
6221make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006222 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006223 PyObject *unicode,
6224 Py_ssize_t startpos, Py_ssize_t endpos,
6225 const char *reason)
6226{
6227 if (*exceptionObject == NULL) {
6228 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006229 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006230 encoding, unicode, startpos, endpos, reason);
6231 }
6232 else {
6233 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6234 goto onError;
6235 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6236 goto onError;
6237 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6238 goto onError;
6239 return;
6240 onError:
6241 Py_DECREF(*exceptionObject);
6242 *exceptionObject = NULL;
6243 }
6244}
6245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006247static void
6248raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006249 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006250 PyObject *unicode,
6251 Py_ssize_t startpos, Py_ssize_t endpos,
6252 const char *reason)
6253{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006254 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006255 encoding, unicode, startpos, endpos, reason);
6256 if (*exceptionObject != NULL)
6257 PyCodec_StrictErrors(*exceptionObject);
6258}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259
6260/* error handling callback helper:
6261 build arguments, call the callback and check the arguments,
6262 put the result into newpos and return the replacement string, which
6263 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264static PyObject *
6265unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006266 PyObject **errorHandler,
6267 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006268 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006269 Py_ssize_t startpos, Py_ssize_t endpos,
6270 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006272 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006273 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 PyObject *restuple;
6275 PyObject *resunicode;
6276
6277 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 }
6282
Benjamin Petersonbac79492012-01-14 13:34:47 -05006283 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006284 return NULL;
6285 len = PyUnicode_GET_LENGTH(unicode);
6286
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006287 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006288 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291
6292 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006297 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 Py_DECREF(restuple);
6299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006301 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 &resunicode, newpos)) {
6303 Py_DECREF(restuple);
6304 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006306 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6307 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6308 Py_DECREF(restuple);
6309 return NULL;
6310 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006312 *newpos = len + *newpos;
6313 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6315 Py_DECREF(restuple);
6316 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 Py_INCREF(resunicode);
6319 Py_DECREF(restuple);
6320 return resunicode;
6321}
6322
Alexander Belopolsky40018472011-02-26 01:02:56 +00006323static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006324unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006325 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006326 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 /* input state */
6329 Py_ssize_t pos=0, size;
6330 int kind;
6331 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332 /* output object */
6333 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 /* pointer into the output */
6335 char *str;
6336 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006338 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6339 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340 PyObject *errorHandler = NULL;
6341 PyObject *exc = NULL;
6342 /* the following variable is used for caching string comparisons
6343 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6344 int known_errorHandler = -1;
6345
Benjamin Petersonbac79492012-01-14 13:34:47 -05006346 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 return NULL;
6348 size = PyUnicode_GET_LENGTH(unicode);
6349 kind = PyUnicode_KIND(unicode);
6350 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 /* allocate enough for a simple encoding without
6352 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006353 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006354 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006355 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006357 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006358 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 ressize = size;
6360
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 while (pos < size) {
6362 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 /* can we encode this? */
6365 if (c<limit) {
6366 /* no overflow check, because we know that the space is enough */
6367 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 Py_ssize_t requiredsize;
6372 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006375 Py_ssize_t collstart = pos;
6376 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006378 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 ++collend;
6380 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6381 if (known_errorHandler==-1) {
6382 if ((errors==NULL) || (!strcmp(errors, "strict")))
6383 known_errorHandler = 1;
6384 else if (!strcmp(errors, "replace"))
6385 known_errorHandler = 2;
6386 else if (!strcmp(errors, "ignore"))
6387 known_errorHandler = 3;
6388 else if (!strcmp(errors, "xmlcharrefreplace"))
6389 known_errorHandler = 4;
6390 else
6391 known_errorHandler = 0;
6392 }
6393 switch (known_errorHandler) {
6394 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006395 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 goto onError;
6397 case 2: /* replace */
6398 while (collstart++<collend)
6399 *str++ = '?'; /* fall through */
6400 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 break;
6403 case 4: /* xmlcharrefreplace */
6404 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 /* determine replacement size */
6406 for (i = collstart, repsize = 0; i < collend; ++i) {
6407 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6408 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006410 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006420 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006421 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006425 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 if (requiredsize > ressize) {
6427 if (requiredsize<2*ressize)
6428 requiredsize = 2*ressize;
6429 if (_PyBytes_Resize(&res, requiredsize))
6430 goto onError;
6431 str = PyBytes_AS_STRING(res) + respos;
6432 ressize = requiredsize;
6433 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006434 /* generate replacement */
6435 for (i = collstart; i < collend; ++i) {
6436 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006438 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 break;
6440 default:
6441 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 encoding, reason, unicode, &exc,
6443 collstart, collend, &newpos);
6444 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006445 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006447 if (PyBytes_Check(repunicode)) {
6448 /* Directly copy bytes result to output. */
6449 repsize = PyBytes_Size(repunicode);
6450 if (repsize > 1) {
6451 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006452 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006453 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6454 Py_DECREF(repunicode);
6455 goto onError;
6456 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006457 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006458 ressize += repsize-1;
6459 }
6460 memcpy(str, PyBytes_AsString(repunicode), repsize);
6461 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006462 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006463 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006464 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006465 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 /* need more space? (at least enough for what we
6467 have+the replacement+the rest of the string, so
6468 we won't have to check space for encodable characters) */
6469 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006470 repsize = PyUnicode_GET_LENGTH(repunicode);
6471 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 if (requiredsize > ressize) {
6473 if (requiredsize<2*ressize)
6474 requiredsize = 2*ressize;
6475 if (_PyBytes_Resize(&res, requiredsize)) {
6476 Py_DECREF(repunicode);
6477 goto onError;
6478 }
6479 str = PyBytes_AS_STRING(res) + respos;
6480 ressize = requiredsize;
6481 }
6482 /* check if there is anything unencodable in the replacement
6483 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 for (i = 0; repsize-->0; ++i, ++str) {
6485 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006487 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 Py_DECREF(repunicode);
6490 goto onError;
6491 }
6492 *str = (char)c;
6493 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006495 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006497 }
6498 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006499 /* Resize if we allocated to much */
6500 size = str - PyBytes_AS_STRING(res);
6501 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006502 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006503 if (_PyBytes_Resize(&res, size) < 0)
6504 goto onError;
6505 }
6506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006507 Py_XDECREF(errorHandler);
6508 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006509 return res;
6510
6511 onError:
6512 Py_XDECREF(res);
6513 Py_XDECREF(errorHandler);
6514 Py_XDECREF(exc);
6515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516}
6517
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006519PyObject *
6520PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006521 Py_ssize_t size,
6522 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 PyObject *result;
6525 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6526 if (unicode == NULL)
6527 return NULL;
6528 result = unicode_encode_ucs1(unicode, errors, 256);
6529 Py_DECREF(unicode);
6530 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531}
6532
Alexander Belopolsky40018472011-02-26 01:02:56 +00006533PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006534_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535{
6536 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 PyErr_BadArgument();
6538 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006540 if (PyUnicode_READY(unicode) == -1)
6541 return NULL;
6542 /* Fast path: if it is a one-byte string, construct
6543 bytes object directly. */
6544 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6545 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6546 PyUnicode_GET_LENGTH(unicode));
6547 /* Non-Latin-1 characters present. Defer to above function to
6548 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006550}
6551
6552PyObject*
6553PyUnicode_AsLatin1String(PyObject *unicode)
6554{
6555 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556}
6557
6558/* --- 7-bit ASCII Codec -------------------------------------------------- */
6559
Alexander Belopolsky40018472011-02-26 01:02:56 +00006560PyObject *
6561PyUnicode_DecodeASCII(const char *s,
6562 Py_ssize_t size,
6563 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006566 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006567 int kind;
6568 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t startinpos;
6570 Py_ssize_t endinpos;
6571 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 const char *e;
6573 PyObject *errorHandler = NULL;
6574 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006575
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006576 if (size == 0) {
6577 Py_INCREF(unicode_empty);
6578 return unicode_empty;
6579 }
6580
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006582 if (size == 1 && (unsigned char)s[0] < 128)
6583 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006584
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006585 unicode = PyUnicode_New(size, 127);
6586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006589 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006590 data = PyUnicode_1BYTE_DATA(unicode);
6591 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6592 if (outpos == size)
6593 return unicode;
6594
6595 s += outpos;
6596 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 register unsigned char c = (unsigned char)*s;
6599 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006600 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 ++s;
6602 }
6603 else {
6604 startinpos = s-starts;
6605 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 if (unicode_decode_call_errorhandler(
6607 errors, &errorHandler,
6608 "ascii", "ordinal not in range(128)",
6609 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006610 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006612 kind = PyUnicode_KIND(unicode);
6613 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006616 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006617 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 Py_XDECREF(errorHandler);
6619 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006620 assert(_PyUnicode_CheckConsistency(unicode, 1));
6621 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006622
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006624 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 Py_XDECREF(errorHandler);
6626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 return NULL;
6628}
6629
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006631PyObject *
6632PyUnicode_EncodeASCII(const Py_UNICODE *p,
6633 Py_ssize_t size,
6634 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006636 PyObject *result;
6637 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6638 if (unicode == NULL)
6639 return NULL;
6640 result = unicode_encode_ucs1(unicode, errors, 128);
6641 Py_DECREF(unicode);
6642 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
Alexander Belopolsky40018472011-02-26 01:02:56 +00006645PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006646_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
6648 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 PyErr_BadArgument();
6650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006652 if (PyUnicode_READY(unicode) == -1)
6653 return NULL;
6654 /* Fast path: if it is an ASCII-only string, construct bytes object
6655 directly. Else defer to above function to raise the exception. */
6656 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6657 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6658 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660}
6661
6662PyObject *
6663PyUnicode_AsASCIIString(PyObject *unicode)
6664{
6665 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Victor Stinner99b95382011-07-04 14:23:54 +02006668#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006669
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006670/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006671
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006672#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006673#define NEED_RETRY
6674#endif
6675
Victor Stinner3a50e702011-10-18 21:21:00 +02006676#ifndef WC_ERR_INVALID_CHARS
6677# define WC_ERR_INVALID_CHARS 0x0080
6678#endif
6679
6680static char*
6681code_page_name(UINT code_page, PyObject **obj)
6682{
6683 *obj = NULL;
6684 if (code_page == CP_ACP)
6685 return "mbcs";
6686 if (code_page == CP_UTF7)
6687 return "CP_UTF7";
6688 if (code_page == CP_UTF8)
6689 return "CP_UTF8";
6690
6691 *obj = PyBytes_FromFormat("cp%u", code_page);
6692 if (*obj == NULL)
6693 return NULL;
6694 return PyBytes_AS_STRING(*obj);
6695}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006696
Alexander Belopolsky40018472011-02-26 01:02:56 +00006697static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006698is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699{
6700 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006701 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702
Victor Stinner3a50e702011-10-18 21:21:00 +02006703 if (!IsDBCSLeadByteEx(code_page, *curr))
6704 return 0;
6705
6706 prev = CharPrevExA(code_page, s, curr, 0);
6707 if (prev == curr)
6708 return 1;
6709 /* FIXME: This code is limited to "true" double-byte encodings,
6710 as it assumes an incomplete character consists of a single
6711 byte. */
6712 if (curr - prev == 2)
6713 return 1;
6714 if (!IsDBCSLeadByteEx(code_page, *prev))
6715 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006716 return 0;
6717}
6718
Victor Stinner3a50e702011-10-18 21:21:00 +02006719static DWORD
6720decode_code_page_flags(UINT code_page)
6721{
6722 if (code_page == CP_UTF7) {
6723 /* The CP_UTF7 decoder only supports flags=0 */
6724 return 0;
6725 }
6726 else
6727 return MB_ERR_INVALID_CHARS;
6728}
6729
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 * Decode a byte string from a Windows code page into unicode object in strict
6732 * mode.
6733 *
6734 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6735 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006736 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006737static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006738decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006739 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006740 const char *in,
6741 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006742{
Victor Stinner3a50e702011-10-18 21:21:00 +02006743 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006744 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006745 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006746
6747 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006748 assert(insize > 0);
6749 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6750 if (outsize <= 0)
6751 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752
6753 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006755 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 if (*v == NULL)
6758 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006759 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760 }
6761 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006763 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006764 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006766 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767 }
6768
6769 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006770 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6771 if (outsize <= 0)
6772 goto error;
6773 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006774
Victor Stinner3a50e702011-10-18 21:21:00 +02006775error:
6776 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6777 return -2;
6778 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006780}
6781
Victor Stinner3a50e702011-10-18 21:21:00 +02006782/*
6783 * Decode a byte string from a code page into unicode object with an error
6784 * handler.
6785 *
6786 * Returns consumed size if succeed, or raise a WindowsError or
6787 * UnicodeDecodeError exception and returns -1 on error.
6788 */
6789static int
6790decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006791 PyObject **v,
6792 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006793 const char *errors)
6794{
6795 const char *startin = in;
6796 const char *endin = in + size;
6797 const DWORD flags = decode_code_page_flags(code_page);
6798 /* Ideally, we should get reason from FormatMessage. This is the Windows
6799 2000 English version of the message. */
6800 const char *reason = "No mapping for the Unicode character exists "
6801 "in the target code page.";
6802 /* each step cannot decode more than 1 character, but a character can be
6803 represented as a surrogate pair */
6804 wchar_t buffer[2], *startout, *out;
6805 int insize, outsize;
6806 PyObject *errorHandler = NULL;
6807 PyObject *exc = NULL;
6808 PyObject *encoding_obj = NULL;
6809 char *encoding;
6810 DWORD err;
6811 int ret = -1;
6812
6813 assert(size > 0);
6814
6815 encoding = code_page_name(code_page, &encoding_obj);
6816 if (encoding == NULL)
6817 return -1;
6818
6819 if (errors == NULL || strcmp(errors, "strict") == 0) {
6820 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6821 UnicodeDecodeError. */
6822 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6823 if (exc != NULL) {
6824 PyCodec_StrictErrors(exc);
6825 Py_CLEAR(exc);
6826 }
6827 goto error;
6828 }
6829
6830 if (*v == NULL) {
6831 /* Create unicode object */
6832 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6833 PyErr_NoMemory();
6834 goto error;
6835 }
Victor Stinnerab595942011-12-17 04:59:06 +01006836 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006837 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006838 if (*v == NULL)
6839 goto error;
6840 startout = PyUnicode_AS_UNICODE(*v);
6841 }
6842 else {
6843 /* Extend unicode object */
6844 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6845 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6846 PyErr_NoMemory();
6847 goto error;
6848 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006849 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006850 goto error;
6851 startout = PyUnicode_AS_UNICODE(*v) + n;
6852 }
6853
6854 /* Decode the byte string character per character */
6855 out = startout;
6856 while (in < endin)
6857 {
6858 /* Decode a character */
6859 insize = 1;
6860 do
6861 {
6862 outsize = MultiByteToWideChar(code_page, flags,
6863 in, insize,
6864 buffer, Py_ARRAY_LENGTH(buffer));
6865 if (outsize > 0)
6866 break;
6867 err = GetLastError();
6868 if (err != ERROR_NO_UNICODE_TRANSLATION
6869 && err != ERROR_INSUFFICIENT_BUFFER)
6870 {
6871 PyErr_SetFromWindowsErr(0);
6872 goto error;
6873 }
6874 insize++;
6875 }
6876 /* 4=maximum length of a UTF-8 sequence */
6877 while (insize <= 4 && (in + insize) <= endin);
6878
6879 if (outsize <= 0) {
6880 Py_ssize_t startinpos, endinpos, outpos;
6881
6882 startinpos = in - startin;
6883 endinpos = startinpos + 1;
6884 outpos = out - PyUnicode_AS_UNICODE(*v);
6885 if (unicode_decode_call_errorhandler(
6886 errors, &errorHandler,
6887 encoding, reason,
6888 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006889 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 {
6891 goto error;
6892 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006893 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 }
6895 else {
6896 in += insize;
6897 memcpy(out, buffer, outsize * sizeof(wchar_t));
6898 out += outsize;
6899 }
6900 }
6901
6902 /* write a NUL character at the end */
6903 *out = 0;
6904
6905 /* Extend unicode object */
6906 outsize = out - startout;
6907 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006908 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006910 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006911
6912error:
6913 Py_XDECREF(encoding_obj);
6914 Py_XDECREF(errorHandler);
6915 Py_XDECREF(exc);
6916 return ret;
6917}
6918
Victor Stinner3a50e702011-10-18 21:21:00 +02006919static PyObject *
6920decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006921 const char *s, Py_ssize_t size,
6922 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006923{
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 PyObject *v = NULL;
6925 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926
Victor Stinner3a50e702011-10-18 21:21:00 +02006927 if (code_page < 0) {
6928 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6929 return NULL;
6930 }
6931
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006932 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934
Victor Stinner76a31a62011-11-04 00:05:13 +01006935 do
6936 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006937#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006938 if (size > INT_MAX) {
6939 chunk_size = INT_MAX;
6940 final = 0;
6941 done = 0;
6942 }
6943 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006945 {
6946 chunk_size = (int)size;
6947 final = (consumed == NULL);
6948 done = 1;
6949 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950
Victor Stinner76a31a62011-11-04 00:05:13 +01006951 /* Skip trailing lead-byte unless 'final' is set */
6952 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6953 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954
Victor Stinner76a31a62011-11-04 00:05:13 +01006955 if (chunk_size == 0 && done) {
6956 if (v != NULL)
6957 break;
6958 Py_INCREF(unicode_empty);
6959 return unicode_empty;
6960 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961
Victor Stinner76a31a62011-11-04 00:05:13 +01006962
6963 converted = decode_code_page_strict(code_page, &v,
6964 s, chunk_size);
6965 if (converted == -2)
6966 converted = decode_code_page_errors(code_page, &v,
6967 s, chunk_size,
6968 errors);
6969 assert(converted != 0);
6970
6971 if (converted < 0) {
6972 Py_XDECREF(v);
6973 return NULL;
6974 }
6975
6976 if (consumed)
6977 *consumed += converted;
6978
6979 s += converted;
6980 size -= converted;
6981 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006982
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006983 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984}
6985
Alexander Belopolsky40018472011-02-26 01:02:56 +00006986PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006987PyUnicode_DecodeCodePageStateful(int code_page,
6988 const char *s,
6989 Py_ssize_t size,
6990 const char *errors,
6991 Py_ssize_t *consumed)
6992{
6993 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6994}
6995
6996PyObject *
6997PyUnicode_DecodeMBCSStateful(const char *s,
6998 Py_ssize_t size,
6999 const char *errors,
7000 Py_ssize_t *consumed)
7001{
7002 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7003}
7004
7005PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007006PyUnicode_DecodeMBCS(const char *s,
7007 Py_ssize_t size,
7008 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007009{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7011}
7012
Victor Stinner3a50e702011-10-18 21:21:00 +02007013static DWORD
7014encode_code_page_flags(UINT code_page, const char *errors)
7015{
7016 if (code_page == CP_UTF8) {
7017 if (winver.dwMajorVersion >= 6)
7018 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7019 and later */
7020 return WC_ERR_INVALID_CHARS;
7021 else
7022 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7023 return 0;
7024 }
7025 else if (code_page == CP_UTF7) {
7026 /* CP_UTF7 only supports flags=0 */
7027 return 0;
7028 }
7029 else {
7030 if (errors != NULL && strcmp(errors, "replace") == 0)
7031 return 0;
7032 else
7033 return WC_NO_BEST_FIT_CHARS;
7034 }
7035}
7036
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007038 * Encode a Unicode string to a Windows code page into a byte string in strict
7039 * mode.
7040 *
7041 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7042 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007044static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007045encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007046 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007048{
Victor Stinner554f3f02010-06-16 23:33:54 +00007049 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 BOOL *pusedDefaultChar = &usedDefaultChar;
7051 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007052 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007053 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007054 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007055 const DWORD flags = encode_code_page_flags(code_page, NULL);
7056 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007057 /* Create a substring so that we can get the UTF-16 representation
7058 of just the slice under consideration. */
7059 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060
Martin v. Löwis3d325192011-11-04 18:23:06 +01007061 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007062
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007064 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007066 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007067
Victor Stinner2fc507f2011-11-04 20:06:39 +01007068 substring = PyUnicode_Substring(unicode, offset, offset+len);
7069 if (substring == NULL)
7070 return -1;
7071 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7072 if (p == NULL) {
7073 Py_DECREF(substring);
7074 return -1;
7075 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007076
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007077 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007078 outsize = WideCharToMultiByte(code_page, flags,
7079 p, size,
7080 NULL, 0,
7081 NULL, pusedDefaultChar);
7082 if (outsize <= 0)
7083 goto error;
7084 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007085 if (pusedDefaultChar && *pusedDefaultChar) {
7086 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007088 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007093 if (*outbytes == NULL) {
7094 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007096 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007097 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098 }
7099 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 const Py_ssize_t n = PyBytes_Size(*outbytes);
7102 if (outsize > PY_SSIZE_T_MAX - n) {
7103 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007104 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007107 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7108 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007110 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112 }
7113
7114 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 outsize = WideCharToMultiByte(code_page, flags,
7116 p, size,
7117 out, outsize,
7118 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007119 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 if (outsize <= 0)
7121 goto error;
7122 if (pusedDefaultChar && *pusedDefaultChar)
7123 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007125
Victor Stinner3a50e702011-10-18 21:21:00 +02007126error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007127 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7129 return -2;
7130 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007131 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007132}
7133
Victor Stinner3a50e702011-10-18 21:21:00 +02007134/*
7135 * Encode a Unicode string to a Windows code page into a byte string using a
7136 * error handler.
7137 *
7138 * Returns consumed characters if succeed, or raise a WindowsError and returns
7139 * -1 on other error.
7140 */
7141static int
7142encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007143 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007144 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007145{
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007147 Py_ssize_t pos = unicode_offset;
7148 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 /* Ideally, we should get reason from FormatMessage. This is the Windows
7150 2000 English version of the message. */
7151 const char *reason = "invalid character";
7152 /* 4=maximum length of a UTF-8 sequence */
7153 char buffer[4];
7154 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7155 Py_ssize_t outsize;
7156 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 PyObject *errorHandler = NULL;
7158 PyObject *exc = NULL;
7159 PyObject *encoding_obj = NULL;
7160 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 PyObject *rep;
7163 int ret = -1;
7164
7165 assert(insize > 0);
7166
7167 encoding = code_page_name(code_page, &encoding_obj);
7168 if (encoding == NULL)
7169 return -1;
7170
7171 if (errors == NULL || strcmp(errors, "strict") == 0) {
7172 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7173 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007174 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 if (exc != NULL) {
7176 PyCodec_StrictErrors(exc);
7177 Py_DECREF(exc);
7178 }
7179 Py_XDECREF(encoding_obj);
7180 return -1;
7181 }
7182
7183 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7184 pusedDefaultChar = &usedDefaultChar;
7185 else
7186 pusedDefaultChar = NULL;
7187
7188 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7189 PyErr_NoMemory();
7190 goto error;
7191 }
7192 outsize = insize * Py_ARRAY_LENGTH(buffer);
7193
7194 if (*outbytes == NULL) {
7195 /* Create string object */
7196 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7197 if (*outbytes == NULL)
7198 goto error;
7199 out = PyBytes_AS_STRING(*outbytes);
7200 }
7201 else {
7202 /* Extend string object */
7203 Py_ssize_t n = PyBytes_Size(*outbytes);
7204 if (n > PY_SSIZE_T_MAX - outsize) {
7205 PyErr_NoMemory();
7206 goto error;
7207 }
7208 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7209 goto error;
7210 out = PyBytes_AS_STRING(*outbytes) + n;
7211 }
7212
7213 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007214 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007216 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7217 wchar_t chars[2];
7218 int charsize;
7219 if (ch < 0x10000) {
7220 chars[0] = (wchar_t)ch;
7221 charsize = 1;
7222 }
7223 else {
7224 ch -= 0x10000;
7225 chars[0] = 0xd800 + (ch >> 10);
7226 chars[1] = 0xdc00 + (ch & 0x3ff);
7227 charsize = 2;
7228 }
7229
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007231 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 buffer, Py_ARRAY_LENGTH(buffer),
7233 NULL, pusedDefaultChar);
7234 if (outsize > 0) {
7235 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7236 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007237 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 memcpy(out, buffer, outsize);
7239 out += outsize;
7240 continue;
7241 }
7242 }
7243 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7244 PyErr_SetFromWindowsErr(0);
7245 goto error;
7246 }
7247
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 rep = unicode_encode_call_errorhandler(
7249 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007250 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007251 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 if (rep == NULL)
7253 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007254 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007255
7256 if (PyBytes_Check(rep)) {
7257 outsize = PyBytes_GET_SIZE(rep);
7258 if (outsize != 1) {
7259 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7260 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7261 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7262 Py_DECREF(rep);
7263 goto error;
7264 }
7265 out = PyBytes_AS_STRING(*outbytes) + offset;
7266 }
7267 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7268 out += outsize;
7269 }
7270 else {
7271 Py_ssize_t i;
7272 enum PyUnicode_Kind kind;
7273 void *data;
7274
Benjamin Petersonbac79492012-01-14 13:34:47 -05007275 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 Py_DECREF(rep);
7277 goto error;
7278 }
7279
7280 outsize = PyUnicode_GET_LENGTH(rep);
7281 if (outsize != 1) {
7282 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7283 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7284 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7285 Py_DECREF(rep);
7286 goto error;
7287 }
7288 out = PyBytes_AS_STRING(*outbytes) + offset;
7289 }
7290 kind = PyUnicode_KIND(rep);
7291 data = PyUnicode_DATA(rep);
7292 for (i=0; i < outsize; i++) {
7293 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7294 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007295 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007296 encoding, unicode,
7297 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 "unable to encode error handler result to ASCII");
7299 Py_DECREF(rep);
7300 goto error;
7301 }
7302 *out = (unsigned char)ch;
7303 out++;
7304 }
7305 }
7306 Py_DECREF(rep);
7307 }
7308 /* write a NUL byte */
7309 *out = 0;
7310 outsize = out - PyBytes_AS_STRING(*outbytes);
7311 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7312 if (_PyBytes_Resize(outbytes, outsize) < 0)
7313 goto error;
7314 ret = 0;
7315
7316error:
7317 Py_XDECREF(encoding_obj);
7318 Py_XDECREF(errorHandler);
7319 Py_XDECREF(exc);
7320 return ret;
7321}
7322
Victor Stinner3a50e702011-10-18 21:21:00 +02007323static PyObject *
7324encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007325 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 const char *errors)
7327{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007328 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007330 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007332
Benjamin Petersonbac79492012-01-14 13:34:47 -05007333 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007334 return NULL;
7335 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007336
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 if (code_page < 0) {
7338 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7339 return NULL;
7340 }
7341
Martin v. Löwis3d325192011-11-04 18:23:06 +01007342 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 return PyBytes_FromStringAndSize(NULL, 0);
7344
Victor Stinner7581cef2011-11-03 22:32:33 +01007345 offset = 0;
7346 do
7347 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007349 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007350 chunks. */
7351 if (len > INT_MAX/2) {
7352 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007353 done = 0;
7354 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007355 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007357 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007358 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007359 done = 1;
7360 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007361
Victor Stinner76a31a62011-11-04 00:05:13 +01007362 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007363 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007364 errors);
7365 if (ret == -2)
7366 ret = encode_code_page_errors(code_page, &outbytes,
7367 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007368 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007369 if (ret < 0) {
7370 Py_XDECREF(outbytes);
7371 return NULL;
7372 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373
Victor Stinner7581cef2011-11-03 22:32:33 +01007374 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007375 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 return outbytes;
7379}
7380
7381PyObject *
7382PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7383 Py_ssize_t size,
7384 const char *errors)
7385{
Victor Stinner7581cef2011-11-03 22:32:33 +01007386 PyObject *unicode, *res;
7387 unicode = PyUnicode_FromUnicode(p, size);
7388 if (unicode == NULL)
7389 return NULL;
7390 res = encode_code_page(CP_ACP, unicode, errors);
7391 Py_DECREF(unicode);
7392 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393}
7394
7395PyObject *
7396PyUnicode_EncodeCodePage(int code_page,
7397 PyObject *unicode,
7398 const char *errors)
7399{
Victor Stinner7581cef2011-11-03 22:32:33 +01007400 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007401}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007402
Alexander Belopolsky40018472011-02-26 01:02:56 +00007403PyObject *
7404PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007405{
7406 if (!PyUnicode_Check(unicode)) {
7407 PyErr_BadArgument();
7408 return NULL;
7409 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007410 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007411}
7412
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413#undef NEED_RETRY
7414
Victor Stinner99b95382011-07-04 14:23:54 +02007415#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007416
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417/* --- Character Mapping Codec -------------------------------------------- */
7418
Alexander Belopolsky40018472011-02-26 01:02:56 +00007419PyObject *
7420PyUnicode_DecodeCharmap(const char *s,
7421 Py_ssize_t size,
7422 PyObject *mapping,
7423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007425 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007426 Py_ssize_t startinpos;
7427 Py_ssize_t endinpos;
7428 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007429 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007430 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007431 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007432 PyObject *errorHandler = NULL;
7433 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007434
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435 /* Default to Latin-1 */
7436 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007439 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007443 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007444 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007446 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007447 Py_ssize_t maplen;
7448 enum PyUnicode_Kind kind;
7449 void *data;
7450 Py_UCS4 x;
7451
Benjamin Petersonbac79492012-01-14 13:34:47 -05007452 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007453 return NULL;
7454
7455 maplen = PyUnicode_GET_LENGTH(mapping);
7456 data = PyUnicode_DATA(mapping);
7457 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 while (s < e) {
7459 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007462 x = PyUnicode_READ(kind, data, ch);
7463 else
7464 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007466 if (x == 0xfffe)
7467 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 startinpos = s-starts;
7470 endinpos = startinpos+1;
7471 if (unicode_decode_call_errorhandler(
7472 errors, &errorHandler,
7473 "charmap", "character maps to <undefined>",
7474 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007475 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 goto onError;
7477 }
7478 continue;
7479 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007480
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007481 if (unicode_putchar(&v, &outpos, x) < 0)
7482 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007484 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007485 }
7486 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 while (s < e) {
7488 unsigned char ch = *s;
7489 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007490
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7492 w = PyLong_FromLong((long)ch);
7493 if (w == NULL)
7494 goto onError;
7495 x = PyObject_GetItem(mapping, w);
7496 Py_DECREF(w);
7497 if (x == NULL) {
7498 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7499 /* No mapping found means: mapping is undefined. */
7500 PyErr_Clear();
7501 x = Py_None;
7502 Py_INCREF(x);
7503 } else
7504 goto onError;
7505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007506
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 /* Apply mapping */
7508 if (PyLong_Check(x)) {
7509 long value = PyLong_AS_LONG(x);
7510 if (value < 0 || value > 65535) {
7511 PyErr_SetString(PyExc_TypeError,
7512 "character mapping must be in range(65536)");
7513 Py_DECREF(x);
7514 goto onError;
7515 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007516 if (unicode_putchar(&v, &outpos, value) < 0)
7517 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 }
7519 else if (x == Py_None) {
7520 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 startinpos = s-starts;
7522 endinpos = startinpos+1;
7523 if (unicode_decode_call_errorhandler(
7524 errors, &errorHandler,
7525 "charmap", "character maps to <undefined>",
7526 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007527 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 Py_DECREF(x);
7529 goto onError;
7530 }
7531 Py_DECREF(x);
7532 continue;
7533 }
7534 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007535 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007536
Benjamin Petersonbac79492012-01-14 13:34:47 -05007537 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007538 goto onError;
7539 targetsize = PyUnicode_GET_LENGTH(x);
7540
7541 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007543 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007544 PyUnicode_READ_CHAR(x, 0)) < 0)
7545 goto onError;
7546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 else if (targetsize > 1) {
7548 /* 1-n mapping */
7549 if (targetsize > extrachars) {
7550 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 Py_ssize_t needed = (targetsize - extrachars) + \
7552 (targetsize << 2);
7553 extrachars += needed;
7554 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007555 if (unicode_resize(&v,
7556 PyUnicode_GET_LENGTH(v) + needed) < 0)
7557 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 Py_DECREF(x);
7559 goto onError;
7560 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007562 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007563 goto onError;
7564 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7565 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 extrachars -= targetsize;
7567 }
7568 /* 1-0 mapping: skip the character */
7569 }
7570 else {
7571 /* wrong return value */
7572 PyErr_SetString(PyExc_TypeError,
7573 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007574 Py_DECREF(x);
7575 goto onError;
7576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 Py_DECREF(x);
7578 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007581 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007582 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 Py_XDECREF(errorHandler);
7584 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007585 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007586
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 Py_XDECREF(errorHandler);
7589 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 Py_XDECREF(v);
7591 return NULL;
7592}
7593
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007594/* Charmap encoding: the lookup table */
7595
Alexander Belopolsky40018472011-02-26 01:02:56 +00007596struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 PyObject_HEAD
7598 unsigned char level1[32];
7599 int count2, count3;
7600 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601};
7602
7603static PyObject*
7604encoding_map_size(PyObject *obj, PyObject* args)
7605{
7606 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007607 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007609}
7610
7611static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007612 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 PyDoc_STR("Return the size (in bytes) of this object") },
7614 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007615};
7616
7617static void
7618encoding_map_dealloc(PyObject* o)
7619{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621}
7622
7623static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 "EncodingMap", /*tp_name*/
7626 sizeof(struct encoding_map), /*tp_basicsize*/
7627 0, /*tp_itemsize*/
7628 /* methods */
7629 encoding_map_dealloc, /*tp_dealloc*/
7630 0, /*tp_print*/
7631 0, /*tp_getattr*/
7632 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007633 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 0, /*tp_repr*/
7635 0, /*tp_as_number*/
7636 0, /*tp_as_sequence*/
7637 0, /*tp_as_mapping*/
7638 0, /*tp_hash*/
7639 0, /*tp_call*/
7640 0, /*tp_str*/
7641 0, /*tp_getattro*/
7642 0, /*tp_setattro*/
7643 0, /*tp_as_buffer*/
7644 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7645 0, /*tp_doc*/
7646 0, /*tp_traverse*/
7647 0, /*tp_clear*/
7648 0, /*tp_richcompare*/
7649 0, /*tp_weaklistoffset*/
7650 0, /*tp_iter*/
7651 0, /*tp_iternext*/
7652 encoding_map_methods, /*tp_methods*/
7653 0, /*tp_members*/
7654 0, /*tp_getset*/
7655 0, /*tp_base*/
7656 0, /*tp_dict*/
7657 0, /*tp_descr_get*/
7658 0, /*tp_descr_set*/
7659 0, /*tp_dictoffset*/
7660 0, /*tp_init*/
7661 0, /*tp_alloc*/
7662 0, /*tp_new*/
7663 0, /*tp_free*/
7664 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007665};
7666
7667PyObject*
7668PyUnicode_BuildEncodingMap(PyObject* string)
7669{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007670 PyObject *result;
7671 struct encoding_map *mresult;
7672 int i;
7673 int need_dict = 0;
7674 unsigned char level1[32];
7675 unsigned char level2[512];
7676 unsigned char *mlevel1, *mlevel2, *mlevel3;
7677 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007678 int kind;
7679 void *data;
7680 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007682 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007683 PyErr_BadArgument();
7684 return NULL;
7685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007686 kind = PyUnicode_KIND(string);
7687 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688 memset(level1, 0xFF, sizeof level1);
7689 memset(level2, 0xFF, sizeof level2);
7690
7691 /* If there isn't a one-to-one mapping of NULL to \0,
7692 or if there are non-BMP characters, we need to use
7693 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007694 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007695 need_dict = 1;
7696 for (i = 1; i < 256; i++) {
7697 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007698 ch = PyUnicode_READ(kind, data, i);
7699 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007700 need_dict = 1;
7701 break;
7702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 /* unmapped character */
7705 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007706 l1 = ch >> 11;
7707 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007708 if (level1[l1] == 0xFF)
7709 level1[l1] = count2++;
7710 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007711 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007712 }
7713
7714 if (count2 >= 0xFF || count3 >= 0xFF)
7715 need_dict = 1;
7716
7717 if (need_dict) {
7718 PyObject *result = PyDict_New();
7719 PyObject *key, *value;
7720 if (!result)
7721 return NULL;
7722 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007723 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007724 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007725 if (!key || !value)
7726 goto failed1;
7727 if (PyDict_SetItem(result, key, value) == -1)
7728 goto failed1;
7729 Py_DECREF(key);
7730 Py_DECREF(value);
7731 }
7732 return result;
7733 failed1:
7734 Py_XDECREF(key);
7735 Py_XDECREF(value);
7736 Py_DECREF(result);
7737 return NULL;
7738 }
7739
7740 /* Create a three-level trie */
7741 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7742 16*count2 + 128*count3 - 1);
7743 if (!result)
7744 return PyErr_NoMemory();
7745 PyObject_Init(result, &EncodingMapType);
7746 mresult = (struct encoding_map*)result;
7747 mresult->count2 = count2;
7748 mresult->count3 = count3;
7749 mlevel1 = mresult->level1;
7750 mlevel2 = mresult->level23;
7751 mlevel3 = mresult->level23 + 16*count2;
7752 memcpy(mlevel1, level1, 32);
7753 memset(mlevel2, 0xFF, 16*count2);
7754 memset(mlevel3, 0, 128*count3);
7755 count3 = 0;
7756 for (i = 1; i < 256; i++) {
7757 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007758 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759 /* unmapped character */
7760 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007761 o1 = PyUnicode_READ(kind, data, i)>>11;
7762 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763 i2 = 16*mlevel1[o1] + o2;
7764 if (mlevel2[i2] == 0xFF)
7765 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007766 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 i3 = 128*mlevel2[i2] + o3;
7768 mlevel3[i3] = i;
7769 }
7770 return result;
7771}
7772
7773static int
Victor Stinner22168992011-11-20 17:09:18 +01007774encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775{
7776 struct encoding_map *map = (struct encoding_map*)mapping;
7777 int l1 = c>>11;
7778 int l2 = (c>>7) & 0xF;
7779 int l3 = c & 0x7F;
7780 int i;
7781
Victor Stinner22168992011-11-20 17:09:18 +01007782 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007784 if (c == 0)
7785 return 0;
7786 /* level 1*/
7787 i = map->level1[l1];
7788 if (i == 0xFF) {
7789 return -1;
7790 }
7791 /* level 2*/
7792 i = map->level23[16*i+l2];
7793 if (i == 0xFF) {
7794 return -1;
7795 }
7796 /* level 3 */
7797 i = map->level23[16*map->count2 + 128*i + l3];
7798 if (i == 0) {
7799 return -1;
7800 }
7801 return i;
7802}
7803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007804/* Lookup the character ch in the mapping. If the character
7805 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007806 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007807static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007808charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809{
Christian Heimes217cfd12007-12-02 14:31:20 +00007810 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811 PyObject *x;
7812
7813 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 x = PyObject_GetItem(mapping, w);
7816 Py_DECREF(w);
7817 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7819 /* No mapping found means: mapping is undefined. */
7820 PyErr_Clear();
7821 x = Py_None;
7822 Py_INCREF(x);
7823 return x;
7824 } else
7825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007827 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007829 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 long value = PyLong_AS_LONG(x);
7831 if (value < 0 || value > 255) {
7832 PyErr_SetString(PyExc_TypeError,
7833 "character mapping must be in range(256)");
7834 Py_DECREF(x);
7835 return NULL;
7836 }
7837 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007839 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 /* wrong return value */
7843 PyErr_Format(PyExc_TypeError,
7844 "character mapping must return integer, bytes or None, not %.400s",
7845 x->ob_type->tp_name);
7846 Py_DECREF(x);
7847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
7849}
7850
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007852charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7855 /* exponentially overallocate to minimize reallocations */
7856 if (requiredsize < 2*outsize)
7857 requiredsize = 2*outsize;
7858 if (_PyBytes_Resize(outobj, requiredsize))
7859 return -1;
7860 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861}
7862
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007867 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 space is available. Return a new reference to the object that
7869 was put in the output buffer, or Py_None, if the mapping was undefined
7870 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007871 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007872static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007873charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007874 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007876 PyObject *rep;
7877 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007878 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879
Christian Heimes90aa7642007-12-19 02:45:37 +00007880 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 if (res == -1)
7884 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 if (outsize<requiredsize)
7886 if (charmapencode_resize(outobj, outpos, requiredsize))
7887 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 outstart[(*outpos)++] = (char)res;
7890 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 }
7892
7893 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 Py_DECREF(rep);
7898 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 if (PyLong_Check(rep)) {
7901 Py_ssize_t requiredsize = *outpos+1;
7902 if (outsize<requiredsize)
7903 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7904 Py_DECREF(rep);
7905 return enc_EXCEPTION;
7906 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007907 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007909 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 else {
7911 const char *repchars = PyBytes_AS_STRING(rep);
7912 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7913 Py_ssize_t requiredsize = *outpos+repsize;
7914 if (outsize<requiredsize)
7915 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7916 Py_DECREF(rep);
7917 return enc_EXCEPTION;
7918 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007919 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 memcpy(outstart + *outpos, repchars, repsize);
7921 *outpos += repsize;
7922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 Py_DECREF(rep);
7925 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926}
7927
7928/* handle an error in PyUnicode_EncodeCharmap
7929 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007930static int
7931charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007932 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007934 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007935 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936{
7937 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007938 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007939 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007940 enum PyUnicode_Kind kind;
7941 void *data;
7942 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007944 Py_ssize_t collstartpos = *inpos;
7945 Py_ssize_t collendpos = *inpos+1;
7946 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 char *encoding = "charmap";
7948 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007950 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007951 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952
Benjamin Petersonbac79492012-01-14 13:34:47 -05007953 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007954 return -1;
7955 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 /* find all unencodable characters */
7957 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007958 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007959 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 val = encoding_map_lookup(ch, mapping);
7962 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 break;
7964 ++collendpos;
7965 continue;
7966 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007967
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007968 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7969 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 if (rep==NULL)
7971 return -1;
7972 else if (rep!=Py_None) {
7973 Py_DECREF(rep);
7974 break;
7975 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978 }
7979 /* cache callback name lookup
7980 * (if not done yet, i.e. it's the first error) */
7981 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 if ((errors==NULL) || (!strcmp(errors, "strict")))
7983 *known_errorHandler = 1;
7984 else if (!strcmp(errors, "replace"))
7985 *known_errorHandler = 2;
7986 else if (!strcmp(errors, "ignore"))
7987 *known_errorHandler = 3;
7988 else if (!strcmp(errors, "xmlcharrefreplace"))
7989 *known_errorHandler = 4;
7990 else
7991 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007992 }
7993 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007994 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007995 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007996 return -1;
7997 case 2: /* replace */
7998 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 x = charmapencode_output('?', mapping, res, respos);
8000 if (x==enc_EXCEPTION) {
8001 return -1;
8002 }
8003 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008004 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 return -1;
8006 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 }
8008 /* fall through */
8009 case 3: /* ignore */
8010 *inpos = collendpos;
8011 break;
8012 case 4: /* xmlcharrefreplace */
8013 /* generate replacement (temporarily (mis)uses p) */
8014 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 char buffer[2+29+1+1];
8016 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008017 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 for (cp = buffer; *cp; ++cp) {
8019 x = charmapencode_output(*cp, mapping, res, respos);
8020 if (x==enc_EXCEPTION)
8021 return -1;
8022 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008023 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return -1;
8025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 }
8027 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008028 *inpos = collendpos;
8029 break;
8030 default:
8031 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008032 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008036 if (PyBytes_Check(repunicode)) {
8037 /* Directly copy bytes result to output. */
8038 Py_ssize_t outsize = PyBytes_Size(*res);
8039 Py_ssize_t requiredsize;
8040 repsize = PyBytes_Size(repunicode);
8041 requiredsize = *respos + repsize;
8042 if (requiredsize > outsize)
8043 /* Make room for all additional bytes. */
8044 if (charmapencode_resize(res, respos, requiredsize)) {
8045 Py_DECREF(repunicode);
8046 return -1;
8047 }
8048 memcpy(PyBytes_AsString(*res) + *respos,
8049 PyBytes_AsString(repunicode), repsize);
8050 *respos += repsize;
8051 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008052 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008053 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008055 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008056 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008057 Py_DECREF(repunicode);
8058 return -1;
8059 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008060 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008061 data = PyUnicode_DATA(repunicode);
8062 kind = PyUnicode_KIND(repunicode);
8063 for (index = 0; index < repsize; index++) {
8064 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8065 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return -1;
8069 }
8070 else if (x==enc_FAILED) {
8071 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008072 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 return -1;
8074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 }
8076 *inpos = newpos;
8077 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 }
8079 return 0;
8080}
8081
Alexander Belopolsky40018472011-02-26 01:02:56 +00008082PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008083_PyUnicode_EncodeCharmap(PyObject *unicode,
8084 PyObject *mapping,
8085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087 /* output object */
8088 PyObject *res = NULL;
8089 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008093 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 PyObject *errorHandler = NULL;
8095 PyObject *exc = NULL;
8096 /* the following variable is used for caching string comparisons
8097 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8098 * 3=ignore, 4=xmlcharrefreplace */
8099 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100
Benjamin Petersonbac79492012-01-14 13:34:47 -05008101 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008102 return NULL;
8103 size = PyUnicode_GET_LENGTH(unicode);
8104
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 /* Default to Latin-1 */
8106 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008107 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 /* allocate enough for a simple encoding without
8110 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008111 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112 if (res == NULL)
8113 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008114 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008120 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 if (x==enc_EXCEPTION) /* error */
8122 goto onError;
8123 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 &exc,
8126 &known_errorHandler, &errorHandler, errors,
8127 &res, &respos)) {
8128 goto onError;
8129 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 else
8132 /* done with this character => adjust input position */
8133 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008137 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008138 if (_PyBytes_Resize(&res, respos) < 0)
8139 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 Py_XDECREF(exc);
8142 Py_XDECREF(errorHandler);
8143 return res;
8144
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 Py_XDECREF(res);
8147 Py_XDECREF(exc);
8148 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 return NULL;
8150}
8151
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152/* Deprecated */
8153PyObject *
8154PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8155 Py_ssize_t size,
8156 PyObject *mapping,
8157 const char *errors)
8158{
8159 PyObject *result;
8160 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8161 if (unicode == NULL)
8162 return NULL;
8163 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8164 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008165 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008166}
8167
Alexander Belopolsky40018472011-02-26 01:02:56 +00008168PyObject *
8169PyUnicode_AsCharmapString(PyObject *unicode,
8170 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171{
8172 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 PyErr_BadArgument();
8174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008176 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177}
8178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008179/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008180static void
8181make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008182 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008183 Py_ssize_t startpos, Py_ssize_t endpos,
8184 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008186 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 *exceptionObject = _PyUnicodeTranslateError_Create(
8188 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 }
8190 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8192 goto onError;
8193 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8194 goto onError;
8195 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8196 goto onError;
8197 return;
8198 onError:
8199 Py_DECREF(*exceptionObject);
8200 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
8202}
8203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205static void
8206raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008207 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208 Py_ssize_t startpos, Py_ssize_t endpos,
8209 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210{
8211 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215}
8216
8217/* error handling callback helper:
8218 build arguments, call the callback and check the arguments,
8219 put the result into newpos and return the replacement string, which
8220 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008221static PyObject *
8222unicode_translate_call_errorhandler(const char *errors,
8223 PyObject **errorHandler,
8224 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008226 Py_ssize_t startpos, Py_ssize_t endpos,
8227 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008229 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008231 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232 PyObject *restuple;
8233 PyObject *resunicode;
8234
8235 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 }
8240
8241 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245
8246 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008251 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 Py_DECREF(restuple);
8253 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 }
8255 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 &resunicode, &i_newpos)) {
8257 Py_DECREF(restuple);
8258 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008260 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008261 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008262 else
8263 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8266 Py_DECREF(restuple);
8267 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008268 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 Py_INCREF(resunicode);
8270 Py_DECREF(restuple);
8271 return resunicode;
8272}
8273
8274/* Lookup the character ch in the mapping and put the result in result,
8275 which must be decrefed by the caller.
8276 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008277static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279{
Christian Heimes217cfd12007-12-02 14:31:20 +00008280 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 PyObject *x;
8282
8283 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 x = PyObject_GetItem(mapping, w);
8286 Py_DECREF(w);
8287 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8289 /* No mapping found means: use 1:1 mapping. */
8290 PyErr_Clear();
8291 *result = NULL;
8292 return 0;
8293 } else
8294 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 }
8296 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 *result = x;
8298 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008300 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 long value = PyLong_AS_LONG(x);
8302 long max = PyUnicode_GetMax();
8303 if (value < 0 || value > max) {
8304 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008305 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 Py_DECREF(x);
8307 return -1;
8308 }
8309 *result = x;
8310 return 0;
8311 }
8312 else if (PyUnicode_Check(x)) {
8313 *result = x;
8314 return 0;
8315 }
8316 else {
8317 /* wrong return value */
8318 PyErr_SetString(PyExc_TypeError,
8319 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 Py_DECREF(x);
8321 return -1;
8322 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323}
8324/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 if not reallocate and adjust various state variables.
8326 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008332 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008333 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 /* exponentially overallocate to minimize reallocations */
8335 if (requiredsize < 2 * oldsize)
8336 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008337 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8338 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008340 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 }
8343 return 0;
8344}
8345/* lookup the character, put the result in the output string and adjust
8346 various state variables. Return a new reference to the object that
8347 was put in the output buffer in *result, or Py_None, if the mapping was
8348 undefined (in which case no character was written).
8349 The called must decref result.
8350 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008351static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8353 PyObject *mapping, Py_UCS4 **output,
8354 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008355 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8358 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 }
8364 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008366 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 }
8370 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 Py_ssize_t repsize;
8372 if (PyUnicode_READY(*res) == -1)
8373 return -1;
8374 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 if (repsize==1) {
8376 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 }
8379 else if (repsize!=0) {
8380 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 Py_ssize_t requiredsize = *opos +
8382 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 Py_ssize_t i;
8385 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 for(i = 0; i < repsize; i++)
8388 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 }
8391 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 return 0;
8394}
8395
Alexander Belopolsky40018472011-02-26 01:02:56 +00008396PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397_PyUnicode_TranslateCharmap(PyObject *input,
8398 PyObject *mapping,
8399 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 /* input object */
8402 char *idata;
8403 Py_ssize_t size, i;
8404 int kind;
8405 /* output buffer */
8406 Py_UCS4 *output = NULL;
8407 Py_ssize_t osize;
8408 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 char *reason = "character maps to <undefined>";
8412 PyObject *errorHandler = NULL;
8413 PyObject *exc = NULL;
8414 /* the following variable is used for caching string comparisons
8415 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8416 * 3=ignore, 4=xmlcharrefreplace */
8417 int known_errorHandler = -1;
8418
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 PyErr_BadArgument();
8421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 if (PyUnicode_READY(input) == -1)
8425 return NULL;
8426 idata = (char*)PyUnicode_DATA(input);
8427 kind = PyUnicode_KIND(input);
8428 size = PyUnicode_GET_LENGTH(input);
8429 i = 0;
8430
8431 if (size == 0) {
8432 Py_INCREF(input);
8433 return input;
8434 }
8435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 /* allocate enough for a simple 1:1 translation without
8437 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 osize = size;
8439 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8440 opos = 0;
8441 if (output == NULL) {
8442 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 /* try to encode it */
8448 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 if (charmaptranslate_output(input, i, mapping,
8450 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_XDECREF(x);
8452 goto onError;
8453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 else { /* untranslatable character */
8458 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8459 Py_ssize_t repsize;
8460 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 Py_ssize_t collstart = i;
8464 Py_ssize_t collend = i+1;
8465 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 while (collend < size) {
8469 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 goto onError;
8471 Py_XDECREF(x);
8472 if (x!=Py_None)
8473 break;
8474 ++collend;
8475 }
8476 /* cache callback name lookup
8477 * (if not done yet, i.e. it's the first error) */
8478 if (known_errorHandler==-1) {
8479 if ((errors==NULL) || (!strcmp(errors, "strict")))
8480 known_errorHandler = 1;
8481 else if (!strcmp(errors, "replace"))
8482 known_errorHandler = 2;
8483 else if (!strcmp(errors, "ignore"))
8484 known_errorHandler = 3;
8485 else if (!strcmp(errors, "xmlcharrefreplace"))
8486 known_errorHandler = 4;
8487 else
8488 known_errorHandler = 0;
8489 }
8490 switch (known_errorHandler) {
8491 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 raise_translate_exception(&exc, input, collstart,
8493 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 case 2: /* replace */
8496 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 for (coll = collstart; coll<collend; coll++)
8498 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* fall through */
8500 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 break;
8503 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 /* generate replacement (temporarily (mis)uses i) */
8505 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 char buffer[2+29+1+1];
8507 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8509 if (charmaptranslate_makespace(&output, &osize,
8510 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 goto onError;
8512 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 break;
8517 default:
8518 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 reason, input, &exc,
8520 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008521 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008523 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008524 Py_DECREF(repunicode);
8525 goto onError;
8526 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 repsize = PyUnicode_GET_LENGTH(repunicode);
8529 if (charmaptranslate_makespace(&output, &osize,
8530 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 Py_DECREF(repunicode);
8532 goto onError;
8533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 for (uni2 = 0; repsize-->0; ++uni2)
8535 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8536 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008539 }
8540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8542 if (!res)
8543 goto onError;
8544 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 Py_XDECREF(exc);
8546 Py_XDECREF(errorHandler);
8547 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 Py_XDECREF(exc);
8552 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 return NULL;
8554}
8555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556/* Deprecated. Use PyUnicode_Translate instead. */
8557PyObject *
8558PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8559 Py_ssize_t size,
8560 PyObject *mapping,
8561 const char *errors)
8562{
8563 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8564 if (!unicode)
8565 return NULL;
8566 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8567}
8568
Alexander Belopolsky40018472011-02-26 01:02:56 +00008569PyObject *
8570PyUnicode_Translate(PyObject *str,
8571 PyObject *mapping,
8572 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573{
8574 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008575
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 str = PyUnicode_FromObject(str);
8577 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 Py_DECREF(str);
8581 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008582
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 Py_XDECREF(str);
8585 return NULL;
8586}
Tim Petersced69f82003-09-16 20:30:58 +00008587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008589fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590{
8591 /* No need to call PyUnicode_READY(self) because this function is only
8592 called as a callback from fixup() which does it already. */
8593 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8594 const int kind = PyUnicode_KIND(self);
8595 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008596 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008597 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_ssize_t i;
8599
8600 for (i = 0; i < len; ++i) {
8601 ch = PyUnicode_READ(kind, data, i);
8602 fixed = 0;
8603 if (ch > 127) {
8604 if (Py_UNICODE_ISSPACE(ch))
8605 fixed = ' ';
8606 else {
8607 const int decimal = Py_UNICODE_TODECIMAL(ch);
8608 if (decimal >= 0)
8609 fixed = '0' + decimal;
8610 }
8611 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008612 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008613 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 PyUnicode_WRITE(kind, data, i, fixed);
8615 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008616 else
8617 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 }
8620
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622}
8623
8624PyObject *
8625_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8626{
8627 if (!PyUnicode_Check(unicode)) {
8628 PyErr_BadInternalCall();
8629 return NULL;
8630 }
8631 if (PyUnicode_READY(unicode) == -1)
8632 return NULL;
8633 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8634 /* If the string is already ASCII, just return the same string */
8635 Py_INCREF(unicode);
8636 return unicode;
8637 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008638 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639}
8640
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008641PyObject *
8642PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8643 Py_ssize_t length)
8644{
Victor Stinnerf0124502011-11-21 23:12:56 +01008645 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008646 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008647 Py_UCS4 maxchar;
8648 enum PyUnicode_Kind kind;
8649 void *data;
8650
Victor Stinner99d7ad02012-02-22 13:37:39 +01008651 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008652 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008653 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008654 if (ch > 127) {
8655 int decimal = Py_UNICODE_TODECIMAL(ch);
8656 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008657 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008658 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008659 }
8660 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008661
8662 /* Copy to a new string */
8663 decimal = PyUnicode_New(length, maxchar);
8664 if (decimal == NULL)
8665 return decimal;
8666 kind = PyUnicode_KIND(decimal);
8667 data = PyUnicode_DATA(decimal);
8668 /* Iterate over code points */
8669 for (i = 0; i < length; i++) {
8670 Py_UNICODE ch = s[i];
8671 if (ch > 127) {
8672 int decimal = Py_UNICODE_TODECIMAL(ch);
8673 if (decimal >= 0)
8674 ch = '0' + decimal;
8675 }
8676 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008678 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008679}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008680/* --- Decimal Encoder ---------------------------------------------------- */
8681
Alexander Belopolsky40018472011-02-26 01:02:56 +00008682int
8683PyUnicode_EncodeDecimal(Py_UNICODE *s,
8684 Py_ssize_t length,
8685 char *output,
8686 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008687{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008688 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008689 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008690 enum PyUnicode_Kind kind;
8691 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008692
8693 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 PyErr_BadArgument();
8695 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696 }
8697
Victor Stinner42bf7752011-11-21 22:52:58 +01008698 unicode = PyUnicode_FromUnicode(s, length);
8699 if (unicode == NULL)
8700 return -1;
8701
Benjamin Petersonbac79492012-01-14 13:34:47 -05008702 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008703 Py_DECREF(unicode);
8704 return -1;
8705 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008706 kind = PyUnicode_KIND(unicode);
8707 data = PyUnicode_DATA(unicode);
8708
Victor Stinnerb84d7232011-11-22 01:50:07 +01008709 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008710 PyObject *exc;
8711 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008713 Py_ssize_t startpos;
8714
8715 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008716
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008719 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 decimal = Py_UNICODE_TODECIMAL(ch);
8723 if (decimal >= 0) {
8724 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008725 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 continue;
8727 }
8728 if (0 < ch && ch < 256) {
8729 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008730 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 continue;
8732 }
Victor Stinner6345be92011-11-25 20:09:01 +01008733
Victor Stinner42bf7752011-11-21 22:52:58 +01008734 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008735 exc = NULL;
8736 raise_encode_exception(&exc, "decimal", unicode,
8737 startpos, startpos+1,
8738 "invalid decimal Unicode string");
8739 Py_XDECREF(exc);
8740 Py_DECREF(unicode);
8741 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008742 }
8743 /* 0-terminate the output string */
8744 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008745 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008746 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008747}
8748
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749/* --- Helpers ------------------------------------------------------------ */
8750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008752any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 Py_ssize_t start,
8754 Py_ssize_t end)
8755{
8756 int kind1, kind2, kind;
8757 void *buf1, *buf2;
8758 Py_ssize_t len1, len2, result;
8759
8760 kind1 = PyUnicode_KIND(s1);
8761 kind2 = PyUnicode_KIND(s2);
8762 kind = kind1 > kind2 ? kind1 : kind2;
8763 buf1 = PyUnicode_DATA(s1);
8764 buf2 = PyUnicode_DATA(s2);
8765 if (kind1 != kind)
8766 buf1 = _PyUnicode_AsKind(s1, kind);
8767 if (!buf1)
8768 return -2;
8769 if (kind2 != kind)
8770 buf2 = _PyUnicode_AsKind(s2, kind);
8771 if (!buf2) {
8772 if (kind1 != kind) PyMem_Free(buf1);
8773 return -2;
8774 }
8775 len1 = PyUnicode_GET_LENGTH(s1);
8776 len2 = PyUnicode_GET_LENGTH(s2);
8777
Victor Stinner794d5672011-10-10 03:21:36 +02008778 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008779 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008780 case PyUnicode_1BYTE_KIND:
8781 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8782 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8783 else
8784 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8785 break;
8786 case PyUnicode_2BYTE_KIND:
8787 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8788 break;
8789 case PyUnicode_4BYTE_KIND:
8790 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8791 break;
8792 default:
8793 assert(0); result = -2;
8794 }
8795 }
8796 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008797 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008798 case PyUnicode_1BYTE_KIND:
8799 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8800 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8801 else
8802 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8803 break;
8804 case PyUnicode_2BYTE_KIND:
8805 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8806 break;
8807 case PyUnicode_4BYTE_KIND:
8808 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8809 break;
8810 default:
8811 assert(0); result = -2;
8812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 }
8814
8815 if (kind1 != kind)
8816 PyMem_Free(buf1);
8817 if (kind2 != kind)
8818 PyMem_Free(buf2);
8819
8820 return result;
8821}
8822
/* Dispatch to the kind-specific *_InsertThousandsGrouping helper.

   unicode        target string, or NULL; with NULL the helper is called
                  with a NULL output buffer and *maxchar is filled in.
   index          character offset into unicode where output starts.
   n_buffer, digits, n_digits, min_width, grouping
                  forwarded unchanged to the helper.
   thousands_sep  separator string; its data and length are forwarded.
   maxchar        written only when unicode is NULL: 127, raised to the
                  separator's maximum character when the helper's result
                  differs from n_digits.

   Returns the helper's result, or -1 if a temporary buffer could not be
   obtained (or the kind is not one of the three known kinds). */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* kind is used as the per-character byte width to skip to index. */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* Widen the separator to the target's kind (temporary buffer,
               freed below). */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): here data is replaced by a temporary buffer of
               kind thousands_sep_kind, but the switch below still
               dispatches on the original kind, the index offset computed
               above is dropped, and the buffer is freed after the call --
               so the helper's output does not appear to reach unicode.
               Confirm whether callers can ever pass a separator wider
               than the target. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): this path returns without freeing a temporary
           buffer allocated above; it is guarded by assert(0). */
        assert(0);
        return -1;
    }
    /* Release whichever temporary buffer was created before the switch. */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        *maxchar = 127;
        /* A result different from n_digits is taken to mean separators
           were inserted, so their maximum character must be allowed for. */
        if (len != n_digits) {
            *maxchar = MAX_MAXCHAR(*maxchar,
                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
8905
8906
/* helper macro to fixup start/end slice values

   end is clamped to len; a negative end or start has len added to it and
   is then clamped to 0.  start is not clamped against len.

   start and end must be modifiable lvalues, and every argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   the macro expands to exactly one statement: it must be followed by a
   semicolon and is safe inside unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008921
Alexander Belopolsky40018472011-02-26 01:02:56 +00008922Py_ssize_t
8923PyUnicode_Count(PyObject *str,
8924 PyObject *substr,
8925 Py_ssize_t start,
8926 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008928 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008929 PyObject* str_obj;
8930 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 int kind1, kind2, kind;
8932 void *buf1 = NULL, *buf2 = NULL;
8933 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008934
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008935 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008936 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008938 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008939 if (!sub_obj) {
8940 Py_DECREF(str_obj);
8941 return -1;
8942 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008943 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008944 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 Py_DECREF(str_obj);
8946 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Tim Petersced69f82003-09-16 20:30:58 +00008948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 kind1 = PyUnicode_KIND(str_obj);
8950 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008951 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008954 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008955 if (kind2 > kind) {
8956 Py_DECREF(sub_obj);
8957 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008958 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008959 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008960 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 if (!buf2)
8963 goto onError;
8964 len1 = PyUnicode_GET_LENGTH(str_obj);
8965 len2 = PyUnicode_GET_LENGTH(sub_obj);
8966
8967 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008968 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008970 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8971 result = asciilib_count(
8972 ((Py_UCS1*)buf1) + start, end - start,
8973 buf2, len2, PY_SSIZE_T_MAX
8974 );
8975 else
8976 result = ucs1lib_count(
8977 ((Py_UCS1*)buf1) + start, end - start,
8978 buf2, len2, PY_SSIZE_T_MAX
8979 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 break;
8981 case PyUnicode_2BYTE_KIND:
8982 result = ucs2lib_count(
8983 ((Py_UCS2*)buf1) + start, end - start,
8984 buf2, len2, PY_SSIZE_T_MAX
8985 );
8986 break;
8987 case PyUnicode_4BYTE_KIND:
8988 result = ucs4lib_count(
8989 ((Py_UCS4*)buf1) + start, end - start,
8990 buf2, len2, PY_SSIZE_T_MAX
8991 );
8992 break;
8993 default:
8994 assert(0); result = 0;
8995 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008996
8997 Py_DECREF(sub_obj);
8998 Py_DECREF(str_obj);
8999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 if (kind2 != kind)
9001 PyMem_Free(buf2);
9002
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 onError:
9005 Py_DECREF(sub_obj);
9006 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 if (kind2 != kind && buf2)
9008 PyMem_Free(buf2);
9009 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010}
9011
Alexander Belopolsky40018472011-02-26 01:02:56 +00009012Py_ssize_t
9013PyUnicode_Find(PyObject *str,
9014 PyObject *sub,
9015 Py_ssize_t start,
9016 Py_ssize_t end,
9017 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009019 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009020
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009022 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009024 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009025 if (!sub) {
9026 Py_DECREF(str);
9027 return -2;
9028 }
9029 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9030 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 Py_DECREF(str);
9032 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 }
Tim Petersced69f82003-09-16 20:30:58 +00009034
Victor Stinner794d5672011-10-10 03:21:36 +02009035 result = any_find_slice(direction,
9036 str, sub, start, end
9037 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009040 Py_DECREF(sub);
9041
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 return result;
9043}
9044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045Py_ssize_t
9046PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9047 Py_ssize_t start, Py_ssize_t end,
9048 int direction)
9049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009051 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (PyUnicode_READY(str) == -1)
9053 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009054 if (start < 0 || end < 0) {
9055 PyErr_SetString(PyExc_IndexError, "string index out of range");
9056 return -2;
9057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 if (end > PyUnicode_GET_LENGTH(str))
9059 end = PyUnicode_GET_LENGTH(str);
9060 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009061 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9062 kind, end-start, ch, direction);
9063 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009065 else
9066 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067}
9068
Alexander Belopolsky40018472011-02-26 01:02:56 +00009069static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009070tailmatch(PyObject *self,
9071 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009072 Py_ssize_t start,
9073 Py_ssize_t end,
9074 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 int kind_self;
9077 int kind_sub;
9078 void *data_self;
9079 void *data_sub;
9080 Py_ssize_t offset;
9081 Py_ssize_t i;
9082 Py_ssize_t end_sub;
9083
9084 if (PyUnicode_READY(self) == -1 ||
9085 PyUnicode_READY(substring) == -1)
9086 return 0;
9087
9088 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 return 1;
9090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9092 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 kind_self = PyUnicode_KIND(self);
9097 data_self = PyUnicode_DATA(self);
9098 kind_sub = PyUnicode_KIND(substring);
9099 data_sub = PyUnicode_DATA(substring);
9100 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9101
9102 if (direction > 0)
9103 offset = end;
9104 else
9105 offset = start;
9106
9107 if (PyUnicode_READ(kind_self, data_self, offset) ==
9108 PyUnicode_READ(kind_sub, data_sub, 0) &&
9109 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9110 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9111 /* If both are of the same kind, memcmp is sufficient */
9112 if (kind_self == kind_sub) {
9113 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009114 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 data_sub,
9116 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009117 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 }
9119 /* otherwise we have to compare each character by first accesing it */
9120 else {
9121 /* We do not need to compare 0 and len(substring)-1 because
9122 the if statement above ensured already that they are equal
9123 when we end up here. */
9124 // TODO: honor direction and do a forward or backwards search
9125 for (i = 1; i < end_sub; ++i) {
9126 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9127 PyUnicode_READ(kind_sub, data_sub, i))
9128 return 0;
9129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 }
9133
9134 return 0;
9135}
9136
Alexander Belopolsky40018472011-02-26 01:02:56 +00009137Py_ssize_t
9138PyUnicode_Tailmatch(PyObject *str,
9139 PyObject *substr,
9140 Py_ssize_t start,
9141 Py_ssize_t end,
9142 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009144 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009145
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 str = PyUnicode_FromObject(str);
9147 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 substr = PyUnicode_FromObject(substr);
9150 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 Py_DECREF(str);
9152 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 }
Tim Petersced69f82003-09-16 20:30:58 +00009154
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009155 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 Py_DECREF(str);
9158 Py_DECREF(substr);
9159 return result;
9160}
9161
/* Apply fixfct filter to the Unicode object self and return a
   reference to the modified object.

   fixfct is run on a copy of self and modifies that copy in place.  It
   returns 0 if it changed nothing, otherwise the maximum character of the
   fixed string.  Returns NULL if the copy or the final allocation fails. */

static PyObject *
fixup(PyObject *self,
      Py_UCS4 (*fixfct)(PyObject *s))
{
    PyObject *u;                            /* working copy of self */
    Py_UCS4 maxchar_old, maxchar_new = 0;
    PyObject *v;                            /* result when the width changes */

    u = _PyUnicode_Copy(self);
    if (u == NULL)
        return NULL;
    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);

    /* fix functions return the new maximum character in a string,
       if the kind of the resulting unicode object does not change,
       everything is fine.  Otherwise we need to change the string kind
       and re-run the fix function. */
    maxchar_new = fixfct(u);

    if (maxchar_new == 0) {
        /* no changes */;
        if (PyUnicode_CheckExact(self)) {
            /* Exact string type: hand back self itself and drop the
               untouched copy. */
            Py_DECREF(u);
            Py_INCREF(self);
            return self;
        }
        else
            /* Otherwise return the copy made by _PyUnicode_Copy(). */
            return u;
    }

    /* NOTE(review): align_maxchar() is defined elsewhere; presumably it
       rounds the value so it can be compared with
       PyUnicode_MAX_CHAR_VALUE() -- confirm. */
    maxchar_new = align_maxchar(maxchar_new);

    if (maxchar_new == maxchar_old)
        return u;

    /* In case the maximum character changed, we need to
       convert the string to the new category. */
    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
    if (v == NULL) {
        Py_DECREF(u);
        return NULL;
    }
    if (maxchar_new > maxchar_old) {
        /* If the maxchar increased so that the kind changed, not all
           characters are representable anymore and we need to fix the
           string again. This only happens in very few cases. */
        /* Start again from the unmodified self, not from u. */
        _PyUnicode_FastCopyCharacters(v, 0,
                                      self, 0, PyUnicode_GET_LENGTH(self));
        /* maxchar_old is reused here only to feed the assertion below. */
        maxchar_old = fixfct(v);
        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
    }
    else {
        /* The maximum shrank: the already-fixed copy u is copied into v. */
        _PyUnicode_FastCopyCharacters(v, 0,
                                      u, 0, PyUnicode_GET_LENGTH(self));
    }
    Py_DECREF(u);
    assert(_PyUnicode_CheckConsistency(v, 1));
    return v;
}
9224
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009225static PyObject *
9226ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009228 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9229 char *resdata, *data = PyUnicode_DATA(self);
9230 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009231
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009232 res = PyUnicode_New(len, 127);
9233 if (res == NULL)
9234 return NULL;
9235 resdata = PyUnicode_DATA(res);
9236 if (lower)
9237 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009239 _Py_bytes_upper(resdata, data, len);
9240 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241}
9242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009244handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009246 Py_ssize_t j;
9247 int final_sigma;
9248 Py_UCS4 c;
9249 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009250
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009251 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9252
9253 where ! is a negation and \p{xxx} is a character with property xxx.
9254 */
9255 for (j = i - 1; j >= 0; j--) {
9256 c = PyUnicode_READ(kind, data, j);
9257 if (!_PyUnicode_IsCaseIgnorable(c))
9258 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009260 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9261 if (final_sigma) {
9262 for (j = i + 1; j < length; j++) {
9263 c = PyUnicode_READ(kind, data, j);
9264 if (!_PyUnicode_IsCaseIgnorable(c))
9265 break;
9266 }
9267 final_sigma = j == length || !_PyUnicode_IsCased(c);
9268 }
9269 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270}
9271
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009272static int
9273lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9274 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009276 /* Obscure special case. */
9277 if (c == 0x3A3) {
9278 mapped[0] = handle_capital_sigma(kind, data, length, i);
9279 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009281 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282}
9283
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009284static Py_ssize_t
9285do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009287 Py_ssize_t i, k = 0;
9288 int n_res, j;
9289 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009290
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009291 c = PyUnicode_READ(kind, data, 0);
9292 n_res = _PyUnicode_ToUpperFull(c, mapped);
9293 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009294 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009295 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009297 for (i = 1; i < length; i++) {
9298 c = PyUnicode_READ(kind, data, i);
9299 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9300 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009301 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009302 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009303 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009304 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009305 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306}
9307
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308static Py_ssize_t
9309do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9310 Py_ssize_t i, k = 0;
9311
9312 for (i = 0; i < length; i++) {
9313 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9314 int n_res, j;
9315 if (Py_UNICODE_ISUPPER(c)) {
9316 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9317 }
9318 else if (Py_UNICODE_ISLOWER(c)) {
9319 n_res = _PyUnicode_ToUpperFull(c, mapped);
9320 }
9321 else {
9322 n_res = 1;
9323 mapped[0] = c;
9324 }
9325 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009326 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009327 res[k++] = mapped[j];
9328 }
9329 }
9330 return k;
9331}
9332
9333static Py_ssize_t
9334do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9335 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009337 Py_ssize_t i, k = 0;
9338
9339 for (i = 0; i < length; i++) {
9340 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9341 int n_res, j;
9342 if (lower)
9343 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9344 else
9345 n_res = _PyUnicode_ToUpperFull(c, mapped);
9346 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009347 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009348 res[k++] = mapped[j];
9349 }
9350 }
9351 return k;
9352}
9353
9354static Py_ssize_t
9355do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9356{
9357 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9358}
9359
9360static Py_ssize_t
9361do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9362{
9363 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9364}
9365
Benjamin Petersone51757f2012-01-12 21:10:29 -05009366static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009367do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9368{
9369 Py_ssize_t i, k = 0;
9370
9371 for (i = 0; i < length; i++) {
9372 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9373 Py_UCS4 mapped[3];
9374 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9375 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009376 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009377 res[k++] = mapped[j];
9378 }
9379 }
9380 return k;
9381}
9382
9383static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009384do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9385{
9386 Py_ssize_t i, k = 0;
9387 int previous_is_cased;
9388
9389 previous_is_cased = 0;
9390 for (i = 0; i < length; i++) {
9391 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9392 Py_UCS4 mapped[3];
9393 int n_res, j;
9394
9395 if (previous_is_cased)
9396 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9397 else
9398 n_res = _PyUnicode_ToTitleFull(c, mapped);
9399
9400 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009401 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009402 res[k++] = mapped[j];
9403 }
9404
9405 previous_is_cased = _PyUnicode_IsCased(c);
9406 }
9407 return k;
9408}
9409
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009410static PyObject *
9411case_operation(PyObject *self,
9412 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9413{
9414 PyObject *res = NULL;
9415 Py_ssize_t length, newlength = 0;
9416 int kind, outkind;
9417 void *data, *outdata;
9418 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9419
Benjamin Petersoneea48462012-01-16 14:28:50 -05009420 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009421
9422 kind = PyUnicode_KIND(self);
9423 data = PyUnicode_DATA(self);
9424 length = PyUnicode_GET_LENGTH(self);
9425 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9426 if (tmp == NULL)
9427 return PyErr_NoMemory();
9428 newlength = perform(kind, data, length, tmp, &maxchar);
9429 res = PyUnicode_New(newlength, maxchar);
9430 if (res == NULL)
9431 goto leave;
9432 tmpend = tmp + newlength;
9433 outdata = PyUnicode_DATA(res);
9434 outkind = PyUnicode_KIND(res);
9435 switch (outkind) {
9436 case PyUnicode_1BYTE_KIND:
9437 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9438 break;
9439 case PyUnicode_2BYTE_KIND:
9440 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9441 break;
9442 case PyUnicode_4BYTE_KIND:
9443 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9444 break;
9445 default:
9446 assert(0);
9447 break;
9448 }
9449 leave:
9450 PyMem_FREE(tmp);
9451 return res;
9452}
9453
Tim Peters8ce9f162004-08-27 01:49:32 +00009454PyObject *
9455PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009458 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009460 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009461 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9462 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009463 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009465 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009467 int use_memcpy;
9468 unsigned char *res_data = NULL, *sep_data = NULL;
9469 PyObject *last_obj;
9470 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471
Tim Peters05eba1f2004-08-27 21:32:02 +00009472 fseq = PySequence_Fast(seq, "");
9473 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009474 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009475 }
9476
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009477 /* NOTE: the following code can't call back into Python code,
9478 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009479 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009480
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 seqlen = PySequence_Fast_GET_SIZE(fseq);
9482 /* If empty sequence, return u"". */
9483 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009484 Py_DECREF(fseq);
9485 Py_INCREF(unicode_empty);
9486 res = unicode_empty;
9487 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009488 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009489
Tim Peters05eba1f2004-08-27 21:32:02 +00009490 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009491 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009492 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009493 if (seqlen == 1) {
9494 if (PyUnicode_CheckExact(items[0])) {
9495 res = items[0];
9496 Py_INCREF(res);
9497 Py_DECREF(fseq);
9498 return res;
9499 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009500 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009501 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009502 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009503 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009504 /* Set up sep and seplen */
9505 if (separator == NULL) {
9506 /* fall back to a blank space separator */
9507 sep = PyUnicode_FromOrdinal(' ');
9508 if (!sep)
9509 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009511 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009512 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009513 else {
9514 if (!PyUnicode_Check(separator)) {
9515 PyErr_Format(PyExc_TypeError,
9516 "separator: expected str instance,"
9517 " %.80s found",
9518 Py_TYPE(separator)->tp_name);
9519 goto onError;
9520 }
9521 if (PyUnicode_READY(separator))
9522 goto onError;
9523 sep = separator;
9524 seplen = PyUnicode_GET_LENGTH(separator);
9525 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9526 /* inc refcount to keep this code path symmetric with the
9527 above case of a blank separator */
9528 Py_INCREF(sep);
9529 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009530 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009531 }
9532
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009533 /* There are at least two things to join, or else we have a subclass
9534 * of str in the sequence.
9535 * Do a pre-pass to figure out the total amount of space we'll
9536 * need (sz), and see whether all argument are strings.
9537 */
9538 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009539#ifdef Py_DEBUG
9540 use_memcpy = 0;
9541#else
9542 use_memcpy = 1;
9543#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009544 for (i = 0; i < seqlen; i++) {
9545 const Py_ssize_t old_sz = sz;
9546 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 if (!PyUnicode_Check(item)) {
9548 PyErr_Format(PyExc_TypeError,
9549 "sequence item %zd: expected str instance,"
9550 " %.80s found",
9551 i, Py_TYPE(item)->tp_name);
9552 goto onError;
9553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 if (PyUnicode_READY(item) == -1)
9555 goto onError;
9556 sz += PyUnicode_GET_LENGTH(item);
9557 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009558 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009559 if (i != 0)
9560 sz += seplen;
9561 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9562 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009564 goto onError;
9565 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009566 if (use_memcpy && last_obj != NULL) {
9567 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9568 use_memcpy = 0;
9569 }
9570 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009571 }
Tim Petersced69f82003-09-16 20:30:58 +00009572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 if (res == NULL)
9575 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009576
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009577 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009578#ifdef Py_DEBUG
9579 use_memcpy = 0;
9580#else
9581 if (use_memcpy) {
9582 res_data = PyUnicode_1BYTE_DATA(res);
9583 kind = PyUnicode_KIND(res);
9584 if (seplen != 0)
9585 sep_data = PyUnicode_1BYTE_DATA(sep);
9586 }
9587#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009589 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009590 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009592 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009593 if (use_memcpy) {
9594 Py_MEMCPY(res_data,
9595 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009596 kind * seplen);
9597 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009598 }
9599 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009600 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009601 res_offset += seplen;
9602 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009604 itemlen = PyUnicode_GET_LENGTH(item);
9605 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009606 if (use_memcpy) {
9607 Py_MEMCPY(res_data,
9608 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009609 kind * itemlen);
9610 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009611 }
9612 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009613 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009614 res_offset += itemlen;
9615 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009616 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009617 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 if (use_memcpy)
9619 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009620 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 else
9622 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009623
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009626 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009630 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009632 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633 return NULL;
9634}
9635
/* FILL(kind, data, value, start, length)

   Store value into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND); any
   other kind trips an assertion.

   This is a statement macro: `kind`, `value` and `length` may be evaluated
   more than once, so pass expressions without side effects.  No bounds
   checking is performed. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9659
Victor Stinnerd3f08822012-05-29 12:57:52 +02009660void
9661_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9662 Py_UCS4 fill_char)
9663{
9664 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9665 const void *data = PyUnicode_DATA(unicode);
9666 assert(PyUnicode_IS_READY(unicode));
9667 assert(unicode_modifiable(unicode));
9668 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9669 assert(start >= 0);
9670 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9671 FILL(kind, data, fill_char, start, length);
9672}
9673
Victor Stinner3fe55312012-01-04 00:33:50 +01009674Py_ssize_t
9675PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9676 Py_UCS4 fill_char)
9677{
9678 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009679
9680 if (!PyUnicode_Check(unicode)) {
9681 PyErr_BadInternalCall();
9682 return -1;
9683 }
9684 if (PyUnicode_READY(unicode) == -1)
9685 return -1;
9686 if (unicode_check_modifiable(unicode))
9687 return -1;
9688
Victor Stinnerd3f08822012-05-29 12:57:52 +02009689 if (start < 0) {
9690 PyErr_SetString(PyExc_IndexError, "string index out of range");
9691 return -1;
9692 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009693 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9694 PyErr_SetString(PyExc_ValueError,
9695 "fill character is bigger than "
9696 "the string maximum character");
9697 return -1;
9698 }
9699
9700 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9701 length = Py_MIN(maxlen, length);
9702 if (length <= 0)
9703 return 0;
9704
Victor Stinnerd3f08822012-05-29 12:57:52 +02009705 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009706 return length;
9707}
9708
Victor Stinner9310abb2011-10-05 00:59:23 +02009709static PyObject *
9710pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009711 Py_ssize_t left,
9712 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 PyObject *u;
9716 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009717 int kind;
9718 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719
9720 if (left < 0)
9721 left = 0;
9722 if (right < 0)
9723 right = 0;
9724
Victor Stinnerc4b49542011-12-11 22:44:26 +01009725 if (left == 0 && right == 0)
9726 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9729 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009730 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9731 return NULL;
9732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009734 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009736 if (!u)
9737 return NULL;
9738
9739 kind = PyUnicode_KIND(u);
9740 data = PyUnicode_DATA(u);
9741 if (left)
9742 FILL(kind, data, fill, 0, left);
9743 if (right)
9744 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009745 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009746 assert(_PyUnicode_CheckConsistency(u, 1));
9747 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748}
9749
Alexander Belopolsky40018472011-02-26 01:02:56 +00009750PyObject *
9751PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754
9755 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009756 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009757 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009758 if (PyUnicode_READY(string) == -1) {
9759 Py_DECREF(string);
9760 return NULL;
9761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762
Benjamin Petersonead6b532011-12-20 17:23:42 -06009763 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009765 if (PyUnicode_IS_ASCII(string))
9766 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009767 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009768 PyUnicode_GET_LENGTH(string), keepends);
9769 else
9770 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 break;
9774 case PyUnicode_2BYTE_KIND:
9775 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 PyUnicode_GET_LENGTH(string), keepends);
9778 break;
9779 case PyUnicode_4BYTE_KIND:
9780 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 PyUnicode_GET_LENGTH(string), keepends);
9783 break;
9784 default:
9785 assert(0);
9786 list = 0;
9787 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 Py_DECREF(string);
9789 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790}
9791
Alexander Belopolsky40018472011-02-26 01:02:56 +00009792static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009793split(PyObject *self,
9794 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009795 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 int kind1, kind2, kind;
9798 void *buf1, *buf2;
9799 Py_ssize_t len1, len2;
9800 PyObject* out;
9801
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009803 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 if (PyUnicode_READY(self) == -1)
9806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009809 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009811 if (PyUnicode_IS_ASCII(self))
9812 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009813 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009814 PyUnicode_GET_LENGTH(self), maxcount
9815 );
9816 else
9817 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009818 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009819 PyUnicode_GET_LENGTH(self), maxcount
9820 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 case PyUnicode_2BYTE_KIND:
9822 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009823 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 PyUnicode_GET_LENGTH(self), maxcount
9825 );
9826 case PyUnicode_4BYTE_KIND:
9827 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 PyUnicode_GET_LENGTH(self), maxcount
9830 );
9831 default:
9832 assert(0);
9833 return NULL;
9834 }
9835
9836 if (PyUnicode_READY(substring) == -1)
9837 return NULL;
9838
9839 kind1 = PyUnicode_KIND(self);
9840 kind2 = PyUnicode_KIND(substring);
9841 kind = kind1 > kind2 ? kind1 : kind2;
9842 buf1 = PyUnicode_DATA(self);
9843 buf2 = PyUnicode_DATA(substring);
9844 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009845 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 if (!buf1)
9847 return NULL;
9848 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009849 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (!buf2) {
9851 if (kind1 != kind) PyMem_Free(buf1);
9852 return NULL;
9853 }
9854 len1 = PyUnicode_GET_LENGTH(self);
9855 len2 = PyUnicode_GET_LENGTH(substring);
9856
Benjamin Petersonead6b532011-12-20 17:23:42 -06009857 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009859 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9860 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009861 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009862 else
9863 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009864 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 break;
9866 case PyUnicode_2BYTE_KIND:
9867 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 break;
9870 case PyUnicode_4BYTE_KIND:
9871 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 break;
9874 default:
9875 out = NULL;
9876 }
9877 if (kind1 != kind)
9878 PyMem_Free(buf1);
9879 if (kind2 != kind)
9880 PyMem_Free(buf2);
9881 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882}
9883
Alexander Belopolsky40018472011-02-26 01:02:56 +00009884static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009885rsplit(PyObject *self,
9886 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009887 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 int kind1, kind2, kind;
9890 void *buf1, *buf2;
9891 Py_ssize_t len1, len2;
9892 PyObject* out;
9893
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009894 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009895 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (PyUnicode_READY(self) == -1)
9898 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009901 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009903 if (PyUnicode_IS_ASCII(self))
9904 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009905 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009906 PyUnicode_GET_LENGTH(self), maxcount
9907 );
9908 else
9909 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009910 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009911 PyUnicode_GET_LENGTH(self), maxcount
9912 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 case PyUnicode_2BYTE_KIND:
9914 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
9918 case PyUnicode_4BYTE_KIND:
9919 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
9923 default:
9924 assert(0);
9925 return NULL;
9926 }
9927
9928 if (PyUnicode_READY(substring) == -1)
9929 return NULL;
9930
9931 kind1 = PyUnicode_KIND(self);
9932 kind2 = PyUnicode_KIND(substring);
9933 kind = kind1 > kind2 ? kind1 : kind2;
9934 buf1 = PyUnicode_DATA(self);
9935 buf2 = PyUnicode_DATA(substring);
9936 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009937 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (!buf1)
9939 return NULL;
9940 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009941 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (!buf2) {
9943 if (kind1 != kind) PyMem_Free(buf1);
9944 return NULL;
9945 }
9946 len1 = PyUnicode_GET_LENGTH(self);
9947 len2 = PyUnicode_GET_LENGTH(substring);
9948
Benjamin Petersonead6b532011-12-20 17:23:42 -06009949 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009951 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9952 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009953 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009954 else
9955 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009956 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 break;
9958 case PyUnicode_2BYTE_KIND:
9959 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 break;
9962 case PyUnicode_4BYTE_KIND:
9963 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009964 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 break;
9966 default:
9967 out = NULL;
9968 }
9969 if (kind1 != kind)
9970 PyMem_Free(buf1);
9971 if (kind2 != kind)
9972 PyMem_Free(buf2);
9973 return out;
9974}
9975
9976static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009977anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9978 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009980 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9983 return asciilib_find(buf1, len1, buf2, len2, offset);
9984 else
9985 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 case PyUnicode_2BYTE_KIND:
9987 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9988 case PyUnicode_4BYTE_KIND:
9989 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9990 }
9991 assert(0);
9992 return -1;
9993}
9994
9995static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009996anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9997 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009999 switch (kind) {
10000 case PyUnicode_1BYTE_KIND:
10001 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10002 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10003 else
10004 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10005 case PyUnicode_2BYTE_KIND:
10006 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10007 case PyUnicode_4BYTE_KIND:
10008 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10009 }
10010 assert(0);
10011 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010012}
10013
Alexander Belopolsky40018472011-02-26 01:02:56 +000010014static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015replace(PyObject *self, PyObject *str1,
10016 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 PyObject *u;
10019 char *sbuf = PyUnicode_DATA(self);
10020 char *buf1 = PyUnicode_DATA(str1);
10021 char *buf2 = PyUnicode_DATA(str2);
10022 int srelease = 0, release1 = 0, release2 = 0;
10023 int skind = PyUnicode_KIND(self);
10024 int kind1 = PyUnicode_KIND(str1);
10025 int kind2 = PyUnicode_KIND(str2);
10026 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10027 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10028 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010029 int mayshrink;
10030 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031
10032 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010035 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Victor Stinner59de0ee2011-10-07 10:01:28 +020010037 if (str1 == str2)
10038 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 if (skind < kind1)
10040 /* substring too wide to be present */
10041 goto nothing;
10042
Victor Stinner49a0a212011-10-12 23:46:10 +020010043 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10044 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10045 /* Replacing str1 with str2 may cause a maxchar reduction in the
10046 result string. */
10047 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010048 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010051 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010053 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010055 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010056 Py_UCS4 u1, u2;
10057 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010058 Py_ssize_t index, pos;
10059 char *src;
10060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010062 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10063 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010064 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010069 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010071
10072 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10073 index = 0;
10074 src = sbuf;
10075 while (--maxcount)
10076 {
10077 pos++;
10078 src += pos * PyUnicode_KIND(self);
10079 slen -= pos;
10080 index += pos;
10081 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10082 if (pos < 0)
10083 break;
10084 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10085 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010086 }
10087 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 int rkind = skind;
10089 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010090 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 if (kind1 < rkind) {
10093 /* widen substring */
10094 buf1 = _PyUnicode_AsKind(str1, rkind);
10095 if (!buf1) goto error;
10096 release1 = 1;
10097 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010098 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010099 if (i < 0)
10100 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 if (rkind > kind2) {
10102 /* widen replacement */
10103 buf2 = _PyUnicode_AsKind(str2, rkind);
10104 if (!buf2) goto error;
10105 release2 = 1;
10106 }
10107 else if (rkind < kind2) {
10108 /* widen self and buf1 */
10109 rkind = kind2;
10110 if (release1) PyMem_Free(buf1);
10111 sbuf = _PyUnicode_AsKind(self, rkind);
10112 if (!sbuf) goto error;
10113 srelease = 1;
10114 buf1 = _PyUnicode_AsKind(str1, rkind);
10115 if (!buf1) goto error;
10116 release1 = 1;
10117 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010118 u = PyUnicode_New(slen, maxchar);
10119 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010121 assert(PyUnicode_KIND(u) == rkind);
10122 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010123
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010124 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010125 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010126 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010128 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010130
10131 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010132 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010133 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010135 if (i == -1)
10136 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010137 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010139 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010143 }
10144 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 Py_ssize_t n, i, j, ires;
10146 Py_ssize_t product, new_size;
10147 int rkind = skind;
10148 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 buf1 = _PyUnicode_AsKind(str1, rkind);
10153 if (!buf1) goto error;
10154 release1 = 1;
10155 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010156 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 if (n == 0)
10158 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010160 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 buf2 = _PyUnicode_AsKind(str2, rkind);
10162 if (!buf2) goto error;
10163 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 rkind = kind2;
10168 sbuf = _PyUnicode_AsKind(self, rkind);
10169 if (!sbuf) goto error;
10170 srelease = 1;
10171 if (release1) PyMem_Free(buf1);
10172 buf1 = _PyUnicode_AsKind(str1, rkind);
10173 if (!buf1) goto error;
10174 release1 = 1;
10175 }
10176 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10177 PyUnicode_GET_LENGTH(str1))); */
10178 product = n * (len2-len1);
10179 if ((product / (len2-len1)) != n) {
10180 PyErr_SetString(PyExc_OverflowError,
10181 "replace string is too long");
10182 goto error;
10183 }
10184 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010185 if (new_size == 0) {
10186 Py_INCREF(unicode_empty);
10187 u = unicode_empty;
10188 goto done;
10189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10191 PyErr_SetString(PyExc_OverflowError,
10192 "replace string is too long");
10193 goto error;
10194 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010195 u = PyUnicode_New(new_size, maxchar);
10196 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010198 assert(PyUnicode_KIND(u) == rkind);
10199 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 ires = i = 0;
10201 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010202 while (n-- > 0) {
10203 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010205 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010207 if (j == -1)
10208 break;
10209 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010210 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010211 memcpy(res + rkind * ires,
10212 sbuf + rkind * i,
10213 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010215 }
10216 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010218 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010220 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010226 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010227 memcpy(res + rkind * ires,
10228 sbuf + rkind * i,
10229 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010230 }
10231 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 /* interleave */
10233 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010234 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010236 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010238 if (--n <= 0)
10239 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010240 memcpy(res + rkind * ires,
10241 sbuf + rkind * i,
10242 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 ires++;
10244 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010245 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 memcpy(res + rkind * ires,
10247 sbuf + rkind * i,
10248 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 }
10251
10252 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010253 unicode_adjust_maxchar(&u);
10254 if (u == NULL)
10255 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010257
10258 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 if (srelease)
10260 PyMem_FREE(sbuf);
10261 if (release1)
10262 PyMem_FREE(buf1);
10263 if (release2)
10264 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010265 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (srelease)
10271 PyMem_FREE(sbuf);
10272 if (release1)
10273 PyMem_FREE(buf1);
10274 if (release2)
10275 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010276 return unicode_result_unchanged(self);
10277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 error:
10279 if (srelease && sbuf)
10280 PyMem_FREE(sbuf);
10281 if (release1 && buf1)
10282 PyMem_FREE(buf1);
10283 if (release2 && buf2)
10284 PyMem_FREE(buf2);
10285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286}
10287
10288/* --- Unicode Object Methods --------------------------------------------- */
10289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010290PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292\n\
10293Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010294characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
10296static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010297unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010299 if (PyUnicode_READY(self) == -1)
10300 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010301 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302}
10303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010305 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306\n\
10307Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010308have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309
10310static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010311unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010313 if (PyUnicode_READY(self) == -1)
10314 return NULL;
10315 if (PyUnicode_GET_LENGTH(self) == 0)
10316 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010317 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318}
10319
Benjamin Petersond5890c82012-01-14 13:23:30 -050010320PyDoc_STRVAR(casefold__doc__,
10321 "S.casefold() -> str\n\
10322\n\
10323Return a version of S suitable for caseless comparisons.");
10324
10325static PyObject *
10326unicode_casefold(PyObject *self)
10327{
10328 if (PyUnicode_READY(self) == -1)
10329 return NULL;
10330 if (PyUnicode_IS_ASCII(self))
10331 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010332 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010333}
10334
10335
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010336/* Argument converter. Coerces to a single unicode character */
10337
10338static int
10339convert_uc(PyObject *obj, void *addr)
10340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010342 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010343
Benjamin Peterson14339b62009-01-31 16:36:08 +000010344 uniobj = PyUnicode_FromObject(obj);
10345 if (uniobj == NULL) {
10346 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010348 return 0;
10349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010351 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010353 Py_DECREF(uniobj);
10354 return 0;
10355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357 Py_DECREF(uniobj);
10358 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010359}
10360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010361PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010364Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010365done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366
10367static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010368unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010370 Py_ssize_t marg, left;
10371 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 Py_UCS4 fillchar = ' ';
10373
Victor Stinnere9a29352011-10-01 02:14:59 +020010374 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Benjamin Petersonbac79492012-01-14 13:34:47 -050010377 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378 return NULL;
10379
Victor Stinnerc4b49542011-12-11 22:44:26 +010010380 if (PyUnicode_GET_LENGTH(self) >= width)
10381 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Victor Stinnerc4b49542011-12-11 22:44:26 +010010383 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384 left = marg / 2 + (marg & width & 1);
10385
Victor Stinner9310abb2011-10-05 00:59:23 +020010386 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387}
10388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389/* This function assumes that str1 and str2 are readied by the caller. */
10390
Marc-André Lemburge5034372000-08-08 08:04:29 +000010391static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010392unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 int kind1, kind2;
10395 void *data1, *data2;
10396 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 kind1 = PyUnicode_KIND(str1);
10399 kind2 = PyUnicode_KIND(str2);
10400 data1 = PyUnicode_DATA(str1);
10401 data2 = PyUnicode_DATA(str2);
10402 len1 = PyUnicode_GET_LENGTH(str1);
10403 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 for (i = 0; i < len1 && i < len2; ++i) {
10406 Py_UCS4 c1, c2;
10407 c1 = PyUnicode_READ(kind1, data1, i);
10408 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010409
10410 if (c1 != c2)
10411 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010412 }
10413
10414 return (len1 < len2) ? -1 : (len1 != len2);
10415}
10416
Alexander Belopolsky40018472011-02-26 01:02:56 +000010417int
10418PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10421 if (PyUnicode_READY(left) == -1 ||
10422 PyUnicode_READY(right) == -1)
10423 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010424 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010426 PyErr_Format(PyExc_TypeError,
10427 "Can't compare %.100s and %.100s",
10428 left->ob_type->tp_name,
10429 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 return -1;
10431}
10432
Martin v. Löwis5b222132007-06-10 09:51:05 +000010433int
10434PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 Py_ssize_t i;
10437 int kind;
10438 void *data;
10439 Py_UCS4 chr;
10440
Victor Stinner910337b2011-10-03 03:20:16 +020010441 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (PyUnicode_READY(uni) == -1)
10443 return -1;
10444 kind = PyUnicode_KIND(uni);
10445 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010446 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10448 if (chr != str[i])
10449 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010450 /* This check keeps Python strings that end in '\0' from comparing equal
10451 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010453 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010454 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010455 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010456 return 0;
10457}
10458
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010459
Benjamin Peterson29060642009-01-31 22:14:21 +000010460#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010462
Alexander Belopolsky40018472011-02-26 01:02:56 +000010463PyObject *
10464PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010465{
10466 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010468 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10469 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (PyUnicode_READY(left) == -1 ||
10471 PyUnicode_READY(right) == -1)
10472 return NULL;
10473 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10474 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010475 if (op == Py_EQ) {
10476 Py_INCREF(Py_False);
10477 return Py_False;
10478 }
10479 if (op == Py_NE) {
10480 Py_INCREF(Py_True);
10481 return Py_True;
10482 }
10483 }
10484 if (left == right)
10485 result = 0;
10486 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010487 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010488
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010489 /* Convert the return value to a Boolean */
10490 switch (op) {
10491 case Py_EQ:
10492 v = TEST_COND(result == 0);
10493 break;
10494 case Py_NE:
10495 v = TEST_COND(result != 0);
10496 break;
10497 case Py_LE:
10498 v = TEST_COND(result <= 0);
10499 break;
10500 case Py_GE:
10501 v = TEST_COND(result >= 0);
10502 break;
10503 case Py_LT:
10504 v = TEST_COND(result == -1);
10505 break;
10506 case Py_GT:
10507 v = TEST_COND(result == 1);
10508 break;
10509 default:
10510 PyErr_BadArgument();
10511 return NULL;
10512 }
10513 Py_INCREF(v);
10514 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516
Brian Curtindfc80e32011-08-10 20:28:54 -050010517 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010518}
10519
Alexander Belopolsky40018472011-02-26 01:02:56 +000010520int
10521PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010522{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 int kind1, kind2, kind;
10525 void *buf1, *buf2;
10526 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010527 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010528
10529 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 sub = PyUnicode_FromObject(element);
10531 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 PyErr_Format(PyExc_TypeError,
10533 "'in <string>' requires string as left operand, not %s",
10534 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010536 }
10537
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010539 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010540 Py_DECREF(sub);
10541 return -1;
10542 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010543 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10544 Py_DECREF(sub);
10545 Py_DECREF(str);
10546 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 kind1 = PyUnicode_KIND(str);
10549 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010550 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 buf1 = PyUnicode_DATA(str);
10552 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010553 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010554 if (kind2 > kind) {
10555 Py_DECREF(sub);
10556 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010557 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010558 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010559 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 if (!buf2) {
10562 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010563 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 return -1;
10565 }
10566 len1 = PyUnicode_GET_LENGTH(str);
10567 len2 = PyUnicode_GET_LENGTH(sub);
10568
Benjamin Petersonead6b532011-12-20 17:23:42 -060010569 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 case PyUnicode_1BYTE_KIND:
10571 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10572 break;
10573 case PyUnicode_2BYTE_KIND:
10574 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10575 break;
10576 case PyUnicode_4BYTE_KIND:
10577 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10578 break;
10579 default:
10580 result = -1;
10581 assert(0);
10582 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010583
10584 Py_DECREF(str);
10585 Py_DECREF(sub);
10586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (kind2 != kind)
10588 PyMem_Free(buf2);
10589
Guido van Rossum403d68b2000-03-13 15:55:09 +000010590 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010591}
10592
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593/* Concat to string or Unicode object giving a new Unicode object. */
10594
Alexander Belopolsky40018472011-02-26 01:02:56 +000010595PyObject *
10596PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010599 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010600 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
10602 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010605 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
10610 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010611 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010612 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010615 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 }
10619
Victor Stinner488fa492011-12-12 00:01:39 +010010620 u_len = PyUnicode_GET_LENGTH(u);
10621 v_len = PyUnicode_GET_LENGTH(v);
10622 if (u_len > PY_SSIZE_T_MAX - v_len) {
10623 PyErr_SetString(PyExc_OverflowError,
10624 "strings are too large to concat");
10625 goto onError;
10626 }
10627 new_len = u_len + v_len;
10628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010630 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010631 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010634 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010637 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10638 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 Py_DECREF(u);
10640 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010641 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643
Benjamin Peterson29060642009-01-31 22:14:21 +000010644 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 Py_XDECREF(u);
10646 Py_XDECREF(v);
10647 return NULL;
10648}
10649
Walter Dörwald1ab83302007-05-18 17:15:44 +000010650void
Victor Stinner23e56682011-10-03 03:54:37 +020010651PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010652{
Victor Stinner23e56682011-10-03 03:54:37 +020010653 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010654 Py_UCS4 maxchar, maxchar2;
10655 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010656
10657 if (p_left == NULL) {
10658 if (!PyErr_Occurred())
10659 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010660 return;
10661 }
Victor Stinner23e56682011-10-03 03:54:37 +020010662 left = *p_left;
10663 if (right == NULL || !PyUnicode_Check(left)) {
10664 if (!PyErr_Occurred())
10665 PyErr_BadInternalCall();
10666 goto error;
10667 }
10668
Benjamin Petersonbac79492012-01-14 13:34:47 -050010669 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010670 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010671 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010672 goto error;
10673
Victor Stinner488fa492011-12-12 00:01:39 +010010674 /* Shortcuts */
10675 if (left == unicode_empty) {
10676 Py_DECREF(left);
10677 Py_INCREF(right);
10678 *p_left = right;
10679 return;
10680 }
10681 if (right == unicode_empty)
10682 return;
10683
10684 left_len = PyUnicode_GET_LENGTH(left);
10685 right_len = PyUnicode_GET_LENGTH(right);
10686 if (left_len > PY_SSIZE_T_MAX - right_len) {
10687 PyErr_SetString(PyExc_OverflowError,
10688 "strings are too large to concat");
10689 goto error;
10690 }
10691 new_len = left_len + right_len;
10692
10693 if (unicode_modifiable(left)
10694 && PyUnicode_CheckExact(right)
10695 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010696 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10697 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010698 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010699 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010700 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10701 {
10702 /* append inplace */
10703 if (unicode_resize(p_left, new_len) != 0) {
10704 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10705 * deallocated so it cannot be put back into
10706 * 'variable'. The MemoryError is raised when there
10707 * is no value in 'variable', which might (very
10708 * remotely) be a cause of incompatibilities.
10709 */
10710 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010711 }
Victor Stinner488fa492011-12-12 00:01:39 +010010712 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010713 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010714 }
Victor Stinner488fa492011-12-12 00:01:39 +010010715 else {
10716 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10717 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010718 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010719
Victor Stinner488fa492011-12-12 00:01:39 +010010720 /* Concat the two Unicode strings */
10721 res = PyUnicode_New(new_len, maxchar);
10722 if (res == NULL)
10723 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010724 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10725 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010726 Py_DECREF(left);
10727 *p_left = res;
10728 }
10729 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010730 return;
10731
10732error:
Victor Stinner488fa492011-12-12 00:01:39 +010010733 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010734}
10735
10736void
10737PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10738{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010739 PyUnicode_Append(pleft, right);
10740 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010741}
10742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010743PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010747string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
10750static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010751unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010753 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010754 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010755 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 int kind1, kind2, kind;
10758 void *buf1, *buf2;
10759 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
Jesus Ceaac451502011-04-20 17:09:23 +020010761 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10762 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 kind1 = PyUnicode_KIND(self);
10766 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010767 if (kind2 > kind1)
10768 return PyLong_FromLong(0);
10769 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 buf1 = PyUnicode_DATA(self);
10771 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010773 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (!buf2) {
10775 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 return NULL;
10777 }
10778 len1 = PyUnicode_GET_LENGTH(self);
10779 len2 = PyUnicode_GET_LENGTH(substring);
10780
10781 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010782 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 case PyUnicode_1BYTE_KIND:
10784 iresult = ucs1lib_count(
10785 ((Py_UCS1*)buf1) + start, end - start,
10786 buf2, len2, PY_SSIZE_T_MAX
10787 );
10788 break;
10789 case PyUnicode_2BYTE_KIND:
10790 iresult = ucs2lib_count(
10791 ((Py_UCS2*)buf1) + start, end - start,
10792 buf2, len2, PY_SSIZE_T_MAX
10793 );
10794 break;
10795 case PyUnicode_4BYTE_KIND:
10796 iresult = ucs4lib_count(
10797 ((Py_UCS4*)buf1) + start, end - start,
10798 buf2, len2, PY_SSIZE_T_MAX
10799 );
10800 break;
10801 default:
10802 assert(0); iresult = 0;
10803 }
10804
10805 result = PyLong_FromSsize_t(iresult);
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 if (kind2 != kind)
10808 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809
10810 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010811
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 return result;
10813}
10814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010816 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010818Encode S using the codec registered for encoding. Default encoding\n\
10819is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010820handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010821a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10822'xmlcharrefreplace' as well as any other name registered with\n\
10823codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
10825static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010826unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010828 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829 char *encoding = NULL;
10830 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010831
Benjamin Peterson308d6372009-09-18 21:42:35 +000010832 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10833 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010835 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010836}
10837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840\n\
10841Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010842If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
10844static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010847 Py_ssize_t i, j, line_pos, src_len, incr;
10848 Py_UCS4 ch;
10849 PyObject *u;
10850 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010852 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010853 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854
10855 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857
Antoine Pitrou22425222011-10-04 19:10:51 +020010858 if (PyUnicode_READY(self) == -1)
10859 return NULL;
10860
Thomas Wouters7e474022000-07-16 12:04:32 +000010861 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 src_len = PyUnicode_GET_LENGTH(self);
10863 i = j = line_pos = 0;
10864 kind = PyUnicode_KIND(self);
10865 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010866 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010867 for (; i < src_len; i++) {
10868 ch = PyUnicode_READ(kind, src_data, i);
10869 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010870 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010874 goto overflow;
10875 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010877 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010881 goto overflow;
10882 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 if (ch == '\n' || ch == '\r')
10885 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010888 if (!found)
10889 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010890
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 if (!u)
10894 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 for (; i < src_len; i++) {
10900 ch = PyUnicode_READ(kind, src_data, i);
10901 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 incr = tabsize - (line_pos % tabsize);
10904 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010905 FILL(kind, dest_data, ' ', j, incr);
10906 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010908 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010910 line_pos++;
10911 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010912 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010913 if (ch == '\n' || ch == '\r')
10914 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 }
10917 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010918 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010919
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010921 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923}
10924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010925PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010926 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927\n\
10928Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010929such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930arguments start and end are interpreted as in slice notation.\n\
10931\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010932Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
10934static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010937 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010938 Py_ssize_t start;
10939 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010940 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
Jesus Ceaac451502011-04-20 17:09:23 +020010942 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10943 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (PyUnicode_READY(self) == -1)
10947 return NULL;
10948 if (PyUnicode_READY(substring) == -1)
10949 return NULL;
10950
Victor Stinner7931d9a2011-11-04 00:22:48 +010010951 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
10953 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (result == -2)
10956 return NULL;
10957
Christian Heimes217cfd12007-12-02 14:31:20 +000010958 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959}
10960
10961static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010962unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010964 void *data;
10965 enum PyUnicode_Kind kind;
10966 Py_UCS4 ch;
10967 PyObject *res;
10968
10969 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10970 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010972 }
10973 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10974 PyErr_SetString(PyExc_IndexError, "string index out of range");
10975 return NULL;
10976 }
10977 kind = PyUnicode_KIND(self);
10978 data = PyUnicode_DATA(self);
10979 ch = PyUnicode_READ(kind, data, index);
10980 if (ch < 256)
10981 return get_latin1_char(ch);
10982
10983 res = PyUnicode_New(1, ch);
10984 if (res == NULL)
10985 return NULL;
10986 kind = PyUnicode_KIND(res);
10987 data = PyUnicode_DATA(res);
10988 PyUnicode_WRITE(kind, data, 0, ch);
10989 assert(_PyUnicode_CheckConsistency(res, 1));
10990 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991}
10992
Guido van Rossumc2504932007-09-18 19:42:40 +000010993/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010994 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010995static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010996unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Guido van Rossumc2504932007-09-18 19:42:40 +000010998 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010999 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011000
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011001#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011002 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011003#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (_PyUnicode_HASH(self) != -1)
11005 return _PyUnicode_HASH(self);
11006 if (PyUnicode_READY(self) == -1)
11007 return -1;
11008 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011009 /*
11010 We make the hash of the empty string be 0, rather than using
11011 (prefix ^ suffix), since this slightly obfuscates the hash secret
11012 */
11013 if (len == 0) {
11014 _PyUnicode_HASH(self) = 0;
11015 return 0;
11016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017
11018 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011019#define HASH(P) \
11020 x ^= (Py_uhash_t) *P << 7; \
11021 while (--len >= 0) \
11022 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Georg Brandl2fb477c2012-02-21 00:33:36 +010011024 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 switch (PyUnicode_KIND(self)) {
11026 case PyUnicode_1BYTE_KIND: {
11027 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11028 HASH(c);
11029 break;
11030 }
11031 case PyUnicode_2BYTE_KIND: {
11032 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11033 HASH(s);
11034 break;
11035 }
11036 default: {
11037 Py_UCS4 *l;
11038 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11039 "Impossible switch case in unicode_hash");
11040 l = PyUnicode_4BYTE_DATA(self);
11041 HASH(l);
11042 break;
11043 }
11044 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011045 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11046 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047
Guido van Rossumc2504932007-09-18 19:42:40 +000011048 if (x == -1)
11049 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011051 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011055PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011058Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
11060static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011063 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011064 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011065 Py_ssize_t start;
11066 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Jesus Ceaac451502011-04-20 17:09:23 +020011068 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11069 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (PyUnicode_READY(self) == -1)
11073 return NULL;
11074 if (PyUnicode_READY(substring) == -1)
11075 return NULL;
11076
Victor Stinner7931d9a2011-11-04 00:22:48 +010011077 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
11079 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (result == -2)
11082 return NULL;
11083
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084 if (result < 0) {
11085 PyErr_SetString(PyExc_ValueError, "substring not found");
11086 return NULL;
11087 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011088
Christian Heimes217cfd12007-12-02 14:31:20 +000011089 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090}
11091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011092PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011095Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011096at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097
11098static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011099unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 Py_ssize_t i, length;
11102 int kind;
11103 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104 int cased;
11105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 if (PyUnicode_READY(self) == -1)
11107 return NULL;
11108 length = PyUnicode_GET_LENGTH(self);
11109 kind = PyUnicode_KIND(self);
11110 data = PyUnicode_DATA(self);
11111
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 if (length == 1)
11114 return PyBool_FromLong(
11115 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011117 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011120
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 for (i = 0; i < length; i++) {
11123 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011124
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11126 return PyBool_FromLong(0);
11127 else if (!cased && Py_UNICODE_ISLOWER(ch))
11128 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011130 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131}
11132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011133PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011136Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011137at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
11139static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011140unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 Py_ssize_t i, length;
11143 int kind;
11144 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 int cased;
11146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 if (PyUnicode_READY(self) == -1)
11148 return NULL;
11149 length = PyUnicode_GET_LENGTH(self);
11150 kind = PyUnicode_KIND(self);
11151 data = PyUnicode_DATA(self);
11152
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 if (length == 1)
11155 return PyBool_FromLong(
11156 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011158 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011160 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011161
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 for (i = 0; i < length; i++) {
11164 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011165
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11167 return PyBool_FromLong(0);
11168 else if (!cased && Py_UNICODE_ISUPPER(ch))
11169 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011171 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172}
11173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011174PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011177Return True if S is a titlecased string and there is at least one\n\
11178character in S, i.e. upper- and titlecase characters may only\n\
11179follow uncased characters and lowercase characters only cased ones.\n\
11180Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
11182static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011183unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 Py_ssize_t i, length;
11186 int kind;
11187 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 int cased, previous_is_cased;
11189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (PyUnicode_READY(self) == -1)
11191 return NULL;
11192 length = PyUnicode_GET_LENGTH(self);
11193 kind = PyUnicode_KIND(self);
11194 data = PyUnicode_DATA(self);
11195
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 if (length == 1) {
11198 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11199 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11200 (Py_UNICODE_ISUPPER(ch) != 0));
11201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011205 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011206
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 cased = 0;
11208 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 for (i = 0; i < length; i++) {
11210 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011211
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11213 if (previous_is_cased)
11214 return PyBool_FromLong(0);
11215 previous_is_cased = 1;
11216 cased = 1;
11217 }
11218 else if (Py_UNICODE_ISLOWER(ch)) {
11219 if (!previous_is_cased)
11220 return PyBool_FromLong(0);
11221 previous_is_cased = 1;
11222 cased = 1;
11223 }
11224 else
11225 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011227 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228}
11229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011230PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011233Return True if all characters in S are whitespace\n\
11234and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
11236static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011237unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 Py_ssize_t i, length;
11240 int kind;
11241 void *data;
11242
11243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245 length = PyUnicode_GET_LENGTH(self);
11246 kind = PyUnicode_KIND(self);
11247 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 1)
11251 return PyBool_FromLong(
11252 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011254 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 for (i = 0; i < length; i++) {
11259 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011260 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011263 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264}
11265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011266PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011268\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011269Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011270and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271
11272static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011273unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 Py_ssize_t i, length;
11276 int kind;
11277 void *data;
11278
11279 if (PyUnicode_READY(self) == -1)
11280 return NULL;
11281 length = PyUnicode_GET_LENGTH(self);
11282 kind = PyUnicode_KIND(self);
11283 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011284
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011285 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (length == 1)
11287 return PyBool_FromLong(
11288 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011289
11290 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 for (i = 0; i < length; i++) {
11295 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011297 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011298 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299}
11300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011301PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011304Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011306
11307static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011308unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 int kind;
11311 void *data;
11312 Py_ssize_t len, i;
11313
11314 if (PyUnicode_READY(self) == -1)
11315 return NULL;
11316
11317 kind = PyUnicode_KIND(self);
11318 data = PyUnicode_DATA(self);
11319 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011321 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (len == 1) {
11323 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11324 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11325 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011326
11327 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 for (i = 0; i < len; i++) {
11332 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011333 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011336 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011337}
11338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011342Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
11345static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011346unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 Py_ssize_t i, length;
11349 int kind;
11350 void *data;
11351
11352 if (PyUnicode_READY(self) == -1)
11353 return NULL;
11354 length = PyUnicode_GET_LENGTH(self);
11355 kind = PyUnicode_KIND(self);
11356 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 if (length == 1)
11360 return PyBool_FromLong(
11361 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011363 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 for (i = 0; i < length; i++) {
11368 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011371 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372}
11373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011377Return True if all characters in S are digits\n\
11378and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
11380static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011381unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 Py_ssize_t i, length;
11384 int kind;
11385 void *data;
11386
11387 if (PyUnicode_READY(self) == -1)
11388 return NULL;
11389 length = PyUnicode_GET_LENGTH(self);
11390 kind = PyUnicode_KIND(self);
11391 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (length == 1) {
11395 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11396 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011399 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 for (i = 0; i < length; i++) {
11404 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011407 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011413Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
11416static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011417unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 Py_ssize_t i, length;
11420 int kind;
11421 void *data;
11422
11423 if (PyUnicode_READY(self) == -1)
11424 return NULL;
11425 length = PyUnicode_GET_LENGTH(self);
11426 kind = PyUnicode_KIND(self);
11427 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (length == 1)
11431 return PyBool_FromLong(
11432 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011434 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 for (i = 0; i < length; i++) {
11439 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011442 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443}
11444
Martin v. Löwis47383402007-08-15 07:32:56 +000011445int
11446PyUnicode_IsIdentifier(PyObject *self)
11447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 int kind;
11449 void *data;
11450 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011451 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (PyUnicode_READY(self) == -1) {
11454 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 }
11457
11458 /* Special case for empty strings */
11459 if (PyUnicode_GET_LENGTH(self) == 0)
11460 return 0;
11461 kind = PyUnicode_KIND(self);
11462 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011463
11464 /* PEP 3131 says that the first character must be in
11465 XID_Start and subsequent characters in XID_Continue,
11466 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011467 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011468 letters, digits, underscore). However, given the current
11469 definition of XID_Start and XID_Continue, it is sufficient
11470 to check just for these, except that _ must be allowed
11471 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011473 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011474 return 0;
11475
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011476 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011479 return 1;
11480}
11481
11482PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011484\n\
11485Return True if S is a valid identifier according\n\
11486to the language definition.");
11487
11488static PyObject*
11489unicode_isidentifier(PyObject *self)
11490{
11491 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11492}
11493
Georg Brandl559e5d72008-06-11 18:37:52 +000011494PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011496\n\
11497Return True if all characters in S are considered\n\
11498printable in repr() or S is empty, False otherwise.");
11499
11500static PyObject*
11501unicode_isprintable(PyObject *self)
11502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 Py_ssize_t i, length;
11504 int kind;
11505 void *data;
11506
11507 if (PyUnicode_READY(self) == -1)
11508 return NULL;
11509 length = PyUnicode_GET_LENGTH(self);
11510 kind = PyUnicode_KIND(self);
11511 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011512
11513 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 if (length == 1)
11515 return PyBool_FromLong(
11516 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 for (i = 0; i < length; i++) {
11519 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011520 Py_RETURN_FALSE;
11521 }
11522 }
11523 Py_RETURN_TRUE;
11524}
11525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011526PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011527 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528\n\
11529Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011530iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
11532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011533unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011535 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536}
11537
Martin v. Löwis18e16552006-02-15 17:27:45 +000011538static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011539unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (PyUnicode_READY(self) == -1)
11542 return -1;
11543 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544}
11545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011549Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011550done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
11552static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011553unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011555 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 Py_UCS4 fillchar = ' ';
11557
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011558 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 return NULL;
11560
Benjamin Petersonbac79492012-01-14 13:34:47 -050011561 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011562 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
Victor Stinnerc4b49542011-12-11 22:44:26 +010011564 if (PyUnicode_GET_LENGTH(self) >= width)
11565 return unicode_result_unchanged(self);
11566
11567 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568}
11569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011570PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011573Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574
11575static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011576unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011578 if (PyUnicode_READY(self) == -1)
11579 return NULL;
11580 if (PyUnicode_IS_ASCII(self))
11581 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011582 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583}
11584
/* Strip modes; they double as indices into stripformat[] below */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Argument-parsing format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11593
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011594/* externally visible for str.strip(unicode) */
11595PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011596_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 void *data;
11599 int kind;
11600 Py_ssize_t i, j, len;
11601 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11604 return NULL;
11605
11606 kind = PyUnicode_KIND(self);
11607 data = PyUnicode_DATA(self);
11608 len = PyUnicode_GET_LENGTH(self);
11609 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11610 PyUnicode_DATA(sepobj),
11611 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011612
Benjamin Peterson14339b62009-01-31 16:36:08 +000011613 i = 0;
11614 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 while (i < len &&
11616 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 i++;
11618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011619 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 j = len;
11622 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 do {
11624 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 } while (j >= i &&
11626 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011628 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011629
Victor Stinner7931d9a2011-11-04 00:22:48 +010011630 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631}
11632
11633PyObject*
11634PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11635{
11636 unsigned char *data;
11637 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011638 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639
Victor Stinnerde636f32011-10-01 03:55:54 +020011640 if (PyUnicode_READY(self) == -1)
11641 return NULL;
11642
Victor Stinner684d5fd2012-05-03 02:32:34 +020011643 length = PyUnicode_GET_LENGTH(self);
11644 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011645
Victor Stinner684d5fd2012-05-03 02:32:34 +020011646 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011647 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648
Victor Stinnerde636f32011-10-01 03:55:54 +020011649 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011650 PyErr_SetString(PyExc_IndexError, "string index out of range");
11651 return NULL;
11652 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011653 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011654 Py_INCREF(unicode_empty);
11655 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011656 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011657
Victor Stinner684d5fd2012-05-03 02:32:34 +020011658 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011659 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011660 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011661 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011662 }
11663 else {
11664 kind = PyUnicode_KIND(self);
11665 data = PyUnicode_1BYTE_DATA(self);
11666 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011667 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011668 length);
11669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
11672static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011673do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 int kind;
11676 void *data;
11677 Py_ssize_t len, i, j;
11678
11679 if (PyUnicode_READY(self) == -1)
11680 return NULL;
11681
11682 kind = PyUnicode_KIND(self);
11683 data = PyUnicode_DATA(self);
11684 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011685
Benjamin Peterson14339b62009-01-31 16:36:08 +000011686 i = 0;
11687 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011689 i++;
11690 }
11691 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692
Benjamin Peterson14339b62009-01-31 16:36:08 +000011693 j = len;
11694 if (striptype != LEFTSTRIP) {
11695 do {
11696 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 j++;
11699 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011700
Victor Stinner7931d9a2011-11-04 00:22:48 +010011701 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702}
11703
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
11705static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011706do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011708 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011709
Benjamin Peterson14339b62009-01-31 16:36:08 +000011710 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11711 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712
Benjamin Peterson14339b62009-01-31 16:36:08 +000011713 if (sep != NULL && sep != Py_None) {
11714 if (PyUnicode_Check(sep))
11715 return _PyUnicode_XStrip(self, striptype, sep);
11716 else {
11717 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 "%s arg must be None or str",
11719 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011720 return NULL;
11721 }
11722 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011725}
11726
11727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011728PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011729 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011730\n\
11731Return a copy of the string S with leading and trailing\n\
11732whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011733If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734
11735static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011736unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011737{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011738 if (PyTuple_GET_SIZE(args) == 0)
11739 return do_strip(self, BOTHSTRIP); /* Common case */
11740 else
11741 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742}
11743
11744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011745PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011747\n\
11748Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011749If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750
11751static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011752unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011754 if (PyTuple_GET_SIZE(args) == 0)
11755 return do_strip(self, LEFTSTRIP); /* Common case */
11756 else
11757 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758}
11759
11760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011763\n\
11764Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011765If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766
11767static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011770 if (PyTuple_GET_SIZE(args) == 0)
11771 return do_strip(self, RIGHTSTRIP); /* Common case */
11772 else
11773 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774}
11775
11776
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011778unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011780 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
Georg Brandl222de0f2009-04-12 12:01:50 +000011783 if (len < 1) {
11784 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011785 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787
Victor Stinnerc4b49542011-12-11 22:44:26 +010011788 /* no repeat, return original string */
11789 if (len == 1)
11790 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011791
Benjamin Petersonbac79492012-01-14 13:34:47 -050011792 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 return NULL;
11794
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011795 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011796 PyErr_SetString(PyExc_OverflowError,
11797 "repeated string is too long");
11798 return NULL;
11799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011801
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011802 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 if (!u)
11804 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011805 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 if (PyUnicode_GET_LENGTH(str) == 1) {
11808 const int kind = PyUnicode_KIND(str);
11809 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011810 if (kind == PyUnicode_1BYTE_KIND) {
11811 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011812 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011813 }
11814 else if (kind == PyUnicode_2BYTE_KIND) {
11815 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011816 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011817 ucs2[n] = fill_char;
11818 } else {
11819 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11820 assert(kind == PyUnicode_4BYTE_KIND);
11821 for (n = 0; n < len; ++n)
11822 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 }
11825 else {
11826 /* number of characters copied this far */
11827 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011828 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 char *to = (char *) PyUnicode_DATA(u);
11830 Py_MEMCPY(to, PyUnicode_DATA(str),
11831 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 n = (done <= nchars-done) ? done : nchars-done;
11834 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011835 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 }
11838
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011839 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011840 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841}
11842
Alexander Belopolsky40018472011-02-26 01:02:56 +000011843PyObject *
11844PyUnicode_Replace(PyObject *obj,
11845 PyObject *subobj,
11846 PyObject *replobj,
11847 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848{
11849 PyObject *self;
11850 PyObject *str1;
11851 PyObject *str2;
11852 PyObject *result;
11853
11854 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011855 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011858 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 Py_DECREF(self);
11860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 }
11862 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011863 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 Py_DECREF(self);
11865 Py_DECREF(str1);
11866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011868 if (PyUnicode_READY(self) == -1 ||
11869 PyUnicode_READY(str1) == -1 ||
11870 PyUnicode_READY(str2) == -1)
11871 result = NULL;
11872 else
11873 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 Py_DECREF(self);
11875 Py_DECREF(str1);
11876 Py_DECREF(str2);
11877 return result;
11878}
11879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011880PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011881 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882\n\
11883Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011884old replaced by new. If the optional argument count is\n\
11885given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 PyObject *str1;
11891 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011892 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 PyObject *result;
11894
Martin v. Löwis18e16552006-02-15 17:27:45 +000011895 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011897 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011900 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 return NULL;
11902 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011903 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 Py_DECREF(str1);
11905 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011906 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011907 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11908 result = NULL;
11909 else
11910 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
11912 Py_DECREF(str1);
11913 Py_DECREF(str2);
11914 return result;
11915}
11916
Alexander Belopolsky40018472011-02-26 01:02:56 +000011917static PyObject *
11918unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011920 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 Py_ssize_t isize;
11922 Py_ssize_t osize, squote, dquote, i, o;
11923 Py_UCS4 max, quote;
11924 int ikind, okind;
11925 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011928 return NULL;
11929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 isize = PyUnicode_GET_LENGTH(unicode);
11931 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 /* Compute length of output, quote characters, and
11934 maximum character */
11935 osize = 2; /* quotes */
11936 max = 127;
11937 squote = dquote = 0;
11938 ikind = PyUnicode_KIND(unicode);
11939 for (i = 0; i < isize; i++) {
11940 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11941 switch (ch) {
11942 case '\'': squote++; osize++; break;
11943 case '"': dquote++; osize++; break;
11944 case '\\': case '\t': case '\r': case '\n':
11945 osize += 2; break;
11946 default:
11947 /* Fast-path ASCII */
11948 if (ch < ' ' || ch == 0x7f)
11949 osize += 4; /* \xHH */
11950 else if (ch < 0x7f)
11951 osize++;
11952 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11953 osize++;
11954 max = ch > max ? ch : max;
11955 }
11956 else if (ch < 0x100)
11957 osize += 4; /* \xHH */
11958 else if (ch < 0x10000)
11959 osize += 6; /* \uHHHH */
11960 else
11961 osize += 10; /* \uHHHHHHHH */
11962 }
11963 }
11964
11965 quote = '\'';
11966 if (squote) {
11967 if (dquote)
11968 /* Both squote and dquote present. Use squote,
11969 and escape them */
11970 osize += squote;
11971 else
11972 quote = '"';
11973 }
11974
11975 repr = PyUnicode_New(osize, max);
11976 if (repr == NULL)
11977 return NULL;
11978 okind = PyUnicode_KIND(repr);
11979 odata = PyUnicode_DATA(repr);
11980
11981 PyUnicode_WRITE(okind, odata, 0, quote);
11982 PyUnicode_WRITE(okind, odata, osize-1, quote);
11983
11984 for (i = 0, o = 1; i < isize; i++) {
11985 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011986
11987 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if ((ch == quote) || (ch == '\\')) {
11989 PyUnicode_WRITE(okind, odata, o++, '\\');
11990 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011991 continue;
11992 }
11993
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011995 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 PyUnicode_WRITE(okind, odata, o++, '\\');
11997 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998 }
11999 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 PyUnicode_WRITE(okind, odata, o++, '\\');
12001 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012002 }
12003 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 PyUnicode_WRITE(okind, odata, o++, '\\');
12005 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012006 }
12007
12008 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012009 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 PyUnicode_WRITE(okind, odata, o++, '\\');
12011 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012012 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12013 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012014 }
12015
Georg Brandl559e5d72008-06-11 18:37:52 +000012016 /* Copy ASCII characters as-is */
12017 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012019 }
12020
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012022 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012023 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012024 (categories Z* and C* except ASCII space)
12025 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012027 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012028 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012031 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12032 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012033 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012034 /* Map 16-bit characters to '\uxxxx' */
12035 else if (ch <= 0xffff) {
12036 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012037 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12038 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12039 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12040 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012041 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012042 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012043 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012044 PyUnicode_WRITE(okind, odata, o++, 'U');
12045 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12046 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12047 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12048 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012053 }
12054 }
12055 /* Copy characters as-is */
12056 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012058 }
12059 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012062 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012063 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064}
12065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012066PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068\n\
12069Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012070such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071arguments start and end are interpreted as in slice notation.\n\
12072\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012073Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
12075static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012078 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012079 Py_ssize_t start;
12080 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012081 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
Jesus Ceaac451502011-04-20 17:09:23 +020012083 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12084 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 if (PyUnicode_READY(self) == -1)
12088 return NULL;
12089 if (PyUnicode_READY(substring) == -1)
12090 return NULL;
12091
Victor Stinner7931d9a2011-11-04 00:22:48 +010012092 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
12094 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (result == -2)
12097 return NULL;
12098
Christian Heimes217cfd12007-12-02 14:31:20 +000012099 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100}
12101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012102PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012103 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012105Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
12107static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012110 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012111 Py_ssize_t start;
12112 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012113 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
Jesus Ceaac451502011-04-20 17:09:23 +020012115 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12116 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (PyUnicode_READY(self) == -1)
12120 return NULL;
12121 if (PyUnicode_READY(substring) == -1)
12122 return NULL;
12123
Victor Stinner7931d9a2011-11-04 00:22:48 +010012124 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125
12126 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (result == -2)
12129 return NULL;
12130
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131 if (result < 0) {
12132 PyErr_SetString(PyExc_ValueError, "substring not found");
12133 return NULL;
12134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135
Christian Heimes217cfd12007-12-02 14:31:20 +000012136 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137}
12138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012139PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012142Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012143done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144
12145static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012146unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012148 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 Py_UCS4 fillchar = ' ';
12150
Victor Stinnere9a29352011-10-01 02:14:59 +020012151 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012153
Benjamin Petersonbac79492012-01-14 13:34:47 -050012154 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 return NULL;
12156
Victor Stinnerc4b49542011-12-11 22:44:26 +010012157 if (PyUnicode_GET_LENGTH(self) >= width)
12158 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Victor Stinnerc4b49542011-12-11 22:44:26 +010012160 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161}
12162
Alexander Belopolsky40018472011-02-26 01:02:56 +000012163PyObject *
12164PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165{
12166 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012167
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 s = PyUnicode_FromObject(s);
12169 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 if (sep != NULL) {
12172 sep = PyUnicode_FromObject(sep);
12173 if (sep == NULL) {
12174 Py_DECREF(s);
12175 return NULL;
12176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 }
12178
Victor Stinner9310abb2011-10-05 00:59:23 +020012179 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
12181 Py_DECREF(s);
12182 Py_XDECREF(sep);
12183 return result;
12184}
12185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012186PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012187 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188\n\
12189Return a list of the words in S, using sep as the\n\
12190delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012191splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012192whitespace string is a separator and empty strings are\n\
12193removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
12195static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012196unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012198 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012200 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012202 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12203 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 return NULL;
12205
12206 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012207 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012209 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012211 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212}
12213
Thomas Wouters477c8d52006-05-27 19:21:47 +000012214PyObject *
12215PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12216{
12217 PyObject* str_obj;
12218 PyObject* sep_obj;
12219 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 int kind1, kind2, kind;
12221 void *buf1 = NULL, *buf2 = NULL;
12222 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223
12224 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012225 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012227 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012228 if (!sep_obj) {
12229 Py_DECREF(str_obj);
12230 return NULL;
12231 }
12232 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12233 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012234 Py_DECREF(str_obj);
12235 return NULL;
12236 }
12237
Victor Stinner14f8f022011-10-05 20:58:25 +020012238 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012240 kind = Py_MAX(kind1, kind2);
12241 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012243 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 if (!buf1)
12245 goto onError;
12246 buf2 = PyUnicode_DATA(sep_obj);
12247 if (kind2 != kind)
12248 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12249 if (!buf2)
12250 goto onError;
12251 len1 = PyUnicode_GET_LENGTH(str_obj);
12252 len2 = PyUnicode_GET_LENGTH(sep_obj);
12253
Benjamin Petersonead6b532011-12-20 17:23:42 -060012254 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012256 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12257 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12258 else
12259 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 break;
12261 case PyUnicode_2BYTE_KIND:
12262 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12263 break;
12264 case PyUnicode_4BYTE_KIND:
12265 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12266 break;
12267 default:
12268 assert(0);
12269 out = 0;
12270 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271
12272 Py_DECREF(sep_obj);
12273 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 if (kind1 != kind)
12275 PyMem_Free(buf1);
12276 if (kind2 != kind)
12277 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012278
12279 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 onError:
12281 Py_DECREF(sep_obj);
12282 Py_DECREF(str_obj);
12283 if (kind1 != kind && buf1)
12284 PyMem_Free(buf1);
12285 if (kind2 != kind && buf2)
12286 PyMem_Free(buf2);
12287 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288}
12289
12290
12291PyObject *
12292PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12293{
12294 PyObject* str_obj;
12295 PyObject* sep_obj;
12296 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 int kind1, kind2, kind;
12298 void *buf1 = NULL, *buf2 = NULL;
12299 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012300
12301 str_obj = PyUnicode_FromObject(str_in);
12302 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304 sep_obj = PyUnicode_FromObject(sep_in);
12305 if (!sep_obj) {
12306 Py_DECREF(str_obj);
12307 return NULL;
12308 }
12309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 kind1 = PyUnicode_KIND(str_in);
12311 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012312 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 buf1 = PyUnicode_DATA(str_in);
12314 if (kind1 != kind)
12315 buf1 = _PyUnicode_AsKind(str_in, kind);
12316 if (!buf1)
12317 goto onError;
12318 buf2 = PyUnicode_DATA(sep_obj);
12319 if (kind2 != kind)
12320 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12321 if (!buf2)
12322 goto onError;
12323 len1 = PyUnicode_GET_LENGTH(str_obj);
12324 len2 = PyUnicode_GET_LENGTH(sep_obj);
12325
Benjamin Petersonead6b532011-12-20 17:23:42 -060012326 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012328 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12329 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12330 else
12331 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 break;
12333 case PyUnicode_2BYTE_KIND:
12334 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12335 break;
12336 case PyUnicode_4BYTE_KIND:
12337 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 break;
12339 default:
12340 assert(0);
12341 out = 0;
12342 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012343
12344 Py_DECREF(sep_obj);
12345 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 if (kind1 != kind)
12347 PyMem_Free(buf1);
12348 if (kind2 != kind)
12349 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350
12351 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 onError:
12353 Py_DECREF(sep_obj);
12354 Py_DECREF(str_obj);
12355 if (kind1 != kind && buf1)
12356 PyMem_Free(buf1);
12357 if (kind2 != kind && buf2)
12358 PyMem_Free(buf2);
12359 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012360}
12361
12362PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012364\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012365Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012366the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012367found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012368
12369static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012370unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012371{
Victor Stinner9310abb2011-10-05 00:59:23 +020012372 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373}
12374
12375PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012376 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012377\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012378Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012380separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012381
12382static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012383unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012384{
Victor Stinner9310abb2011-10-05 00:59:23 +020012385 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012386}
12387
Alexander Belopolsky40018472011-02-26 01:02:56 +000012388PyObject *
12389PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012390{
12391 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012393 s = PyUnicode_FromObject(s);
12394 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 if (sep != NULL) {
12397 sep = PyUnicode_FromObject(sep);
12398 if (sep == NULL) {
12399 Py_DECREF(s);
12400 return NULL;
12401 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012402 }
12403
Victor Stinner9310abb2011-10-05 00:59:23 +020012404 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012405
12406 Py_DECREF(s);
12407 Py_XDECREF(sep);
12408 return result;
12409}
12410
12411PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012412 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012413\n\
12414Return a list of the words in S, using sep as the\n\
12415delimiter string, starting at the end of the string and\n\
12416working to the front. If maxsplit is given, at most maxsplit\n\
12417splits are done. If sep is not specified, any whitespace string\n\
12418is a separator.");
12419
12420static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012421unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012422{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012423 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012424 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012425 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012426
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012427 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12428 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012429 return NULL;
12430
12431 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012432 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012433 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012434 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012435 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012436 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437}
12438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012439PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441\n\
12442Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012443Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012444is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
12446static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012447unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012449 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012450 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012452 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12453 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454 return NULL;
12455
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012456 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457}
12458
12459static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012460PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012462 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463}
12464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012465PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467\n\
12468Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012469and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470
12471static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012472unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012474 if (PyUnicode_READY(self) == -1)
12475 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012476 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477}
12478
Georg Brandlceee0772007-11-27 23:48:05 +000012479PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012481\n\
12482Return a translation table usable for str.translate().\n\
12483If there is only one argument, it must be a dictionary mapping Unicode\n\
12484ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012485Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012486If there are two arguments, they must be strings of equal length, and\n\
12487in the resulting dictionary, each character in x will be mapped to the\n\
12488character at the same position in y. If there is a third argument, it\n\
12489must be a string, whose characters will be mapped to None in the result.");
12490
12491static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012492unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012493{
12494 PyObject *x, *y = NULL, *z = NULL;
12495 PyObject *new = NULL, *key, *value;
12496 Py_ssize_t i = 0;
12497 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012498
Georg Brandlceee0772007-11-27 23:48:05 +000012499 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12500 return NULL;
12501 new = PyDict_New();
12502 if (!new)
12503 return NULL;
12504 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 int x_kind, y_kind, z_kind;
12506 void *x_data, *y_data, *z_data;
12507
Georg Brandlceee0772007-11-27 23:48:05 +000012508 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012509 if (!PyUnicode_Check(x)) {
12510 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12511 "be a string if there is a second argument");
12512 goto err;
12513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012515 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12516 "arguments must have equal length");
12517 goto err;
12518 }
12519 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 x_kind = PyUnicode_KIND(x);
12521 y_kind = PyUnicode_KIND(y);
12522 x_data = PyUnicode_DATA(x);
12523 y_data = PyUnicode_DATA(y);
12524 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12525 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012526 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012527 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012528 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012529 if (!value) {
12530 Py_DECREF(key);
12531 goto err;
12532 }
Georg Brandlceee0772007-11-27 23:48:05 +000012533 res = PyDict_SetItem(new, key, value);
12534 Py_DECREF(key);
12535 Py_DECREF(value);
12536 if (res < 0)
12537 goto err;
12538 }
12539 /* create entries for deleting chars in z */
12540 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 z_kind = PyUnicode_KIND(z);
12542 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012543 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012545 if (!key)
12546 goto err;
12547 res = PyDict_SetItem(new, key, Py_None);
12548 Py_DECREF(key);
12549 if (res < 0)
12550 goto err;
12551 }
12552 }
12553 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 int kind;
12555 void *data;
12556
Georg Brandlceee0772007-11-27 23:48:05 +000012557 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012558 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012559 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12560 "to maketrans it must be a dict");
12561 goto err;
12562 }
12563 /* copy entries into the new dict, converting string keys to int keys */
12564 while (PyDict_Next(x, &i, &key, &value)) {
12565 if (PyUnicode_Check(key)) {
12566 /* convert string keys to integer keys */
12567 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012568 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012569 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12570 "table must be of length 1");
12571 goto err;
12572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 kind = PyUnicode_KIND(key);
12574 data = PyUnicode_DATA(key);
12575 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012576 if (!newkey)
12577 goto err;
12578 res = PyDict_SetItem(new, newkey, value);
12579 Py_DECREF(newkey);
12580 if (res < 0)
12581 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012582 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012583 /* just keep integer keys */
12584 if (PyDict_SetItem(new, key, value) < 0)
12585 goto err;
12586 } else {
12587 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12588 "be strings or integers");
12589 goto err;
12590 }
12591 }
12592 }
12593 return new;
12594 err:
12595 Py_DECREF(new);
12596 return NULL;
12597}
12598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012599PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601\n\
12602Return a copy of the string S, where all characters have been mapped\n\
12603through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012604Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012605Unmapped characters are left untouched. Characters mapped to None\n\
12606are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607
12608static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612}
12613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012614PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012617Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
12619static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012620unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012622 if (PyUnicode_READY(self) == -1)
12623 return NULL;
12624 if (PyUnicode_IS_ASCII(self))
12625 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012626 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627}
12628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012629PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012632Pad a numeric string S with zeros on the left, to fill a field\n\
12633of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
12635static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012636unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012638 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012639 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012640 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 int kind;
12642 void *data;
12643 Py_UCS4 chr;
12644
Martin v. Löwis18e16552006-02-15 17:27:45 +000012645 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646 return NULL;
12647
Benjamin Petersonbac79492012-01-14 13:34:47 -050012648 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650
Victor Stinnerc4b49542011-12-11 22:44:26 +010012651 if (PyUnicode_GET_LENGTH(self) >= width)
12652 return unicode_result_unchanged(self);
12653
12654 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655
12656 u = pad(self, fill, 0, '0');
12657
Walter Dörwald068325e2002-04-15 13:36:47 +000012658 if (u == NULL)
12659 return NULL;
12660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 kind = PyUnicode_KIND(u);
12662 data = PyUnicode_DATA(u);
12663 chr = PyUnicode_READ(kind, data, fill);
12664
12665 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 PyUnicode_WRITE(kind, data, 0, chr);
12668 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669 }
12670
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012671 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012672 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674
#if 0
/* Disabled wrapper (compiled out by the surrounding #if 0): forwards self
   to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012683PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012686Return True if S starts with the specified prefix, False otherwise.\n\
12687With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012688With optional end, stop comparing S at that position.\n\
12689prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690
12691static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012692unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012695 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012696 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012697 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012698 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012699 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700
Jesus Ceaac451502011-04-20 17:09:23 +020012701 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012703 if (PyTuple_Check(subobj)) {
12704 Py_ssize_t i;
12705 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012706 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012707 if (substring == NULL)
12708 return NULL;
12709 result = tailmatch(self, substring, start, end, -1);
12710 Py_DECREF(substring);
12711 if (result) {
12712 Py_RETURN_TRUE;
12713 }
12714 }
12715 /* nothing matched */
12716 Py_RETURN_FALSE;
12717 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012718 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012719 if (substring == NULL) {
12720 if (PyErr_ExceptionMatches(PyExc_TypeError))
12721 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12722 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012724 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012725 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012727 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728}
12729
12730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012731PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012734Return True if S ends with the specified suffix, False otherwise.\n\
12735With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012736With optional end, stop comparing S at that position.\n\
12737suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738
12739static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012740unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012743 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012744 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012745 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012746 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012747 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748
Jesus Ceaac451502011-04-20 17:09:23 +020012749 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012751 if (PyTuple_Check(subobj)) {
12752 Py_ssize_t i;
12753 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012754 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012756 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012758 result = tailmatch(self, substring, start, end, +1);
12759 Py_DECREF(substring);
12760 if (result) {
12761 Py_RETURN_TRUE;
12762 }
12763 }
12764 Py_RETURN_FALSE;
12765 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012766 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012767 if (substring == NULL) {
12768 if (PyErr_ExceptionMatches(PyExc_TypeError))
12769 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12770 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012771 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012772 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012773 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012775 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776}
12777
Victor Stinner202fdca2012-05-07 12:47:02 +020012778Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012779_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012780{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012781 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012782 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12783 writer->data = PyUnicode_DATA(writer->buffer);
12784 writer->kind = PyUnicode_KIND(writer->buffer);
12785}
12786
Victor Stinnerd3f08822012-05-29 12:57:52 +020012787void
12788_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012789{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012790 memset(writer, 0, sizeof(*writer));
12791#ifdef Py_DEBUG
12792 writer->kind = 5; /* invalid kind */
12793#endif
12794 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012795 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012796}
12797
Victor Stinnerd3f08822012-05-29 12:57:52 +020012798int
12799_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12800 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012801{
12802 Py_ssize_t newlen;
12803 PyObject *newbuffer;
12804
Victor Stinnerd3f08822012-05-29 12:57:52 +020012805 assert(length > 0);
12806
Victor Stinner202fdca2012-05-07 12:47:02 +020012807 if (length > PY_SSIZE_T_MAX - writer->pos) {
12808 PyErr_NoMemory();
12809 return -1;
12810 }
12811 newlen = writer->pos + length;
12812
Victor Stinnerd3f08822012-05-29 12:57:52 +020012813 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012814 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012815 /* overallocate 25% to limit the number of resize */
12816 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12817 newlen += newlen / 4;
12818 if (newlen < writer->min_length)
12819 newlen = writer->min_length;
12820 }
12821 writer->buffer = PyUnicode_New(newlen, maxchar);
12822 if (writer->buffer == NULL)
12823 return -1;
12824 _PyUnicodeWriter_Update(writer);
12825 return 0;
12826 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012827
Victor Stinnerd3f08822012-05-29 12:57:52 +020012828 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012829 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012830 /* overallocate 25% to limit the number of resize */
12831 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12832 newlen += newlen / 4;
12833 if (newlen < writer->min_length)
12834 newlen = writer->min_length;
12835 }
12836
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012837 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012838 /* resize + widen */
12839 newbuffer = PyUnicode_New(newlen, maxchar);
12840 if (newbuffer == NULL)
12841 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012842 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12843 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012844 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012845 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012846 }
12847 else {
12848 newbuffer = resize_compact(writer->buffer, newlen);
12849 if (newbuffer == NULL)
12850 return -1;
12851 }
12852 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012853 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012854 }
12855 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012856 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012857 newbuffer = PyUnicode_New(writer->size, maxchar);
12858 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012859 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012860 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12861 writer->buffer, 0, writer->pos);
12862 Py_DECREF(writer->buffer);
12863 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012864 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012865 }
12866 return 0;
12867}
12868
Victor Stinnerd3f08822012-05-29 12:57:52 +020012869int
12870_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12871{
12872 Py_UCS4 maxchar;
12873 Py_ssize_t len;
12874
12875 if (PyUnicode_READY(str) == -1)
12876 return -1;
12877 len = PyUnicode_GET_LENGTH(str);
12878 if (len == 0)
12879 return 0;
12880 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12881 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012882 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012883 Py_INCREF(str);
12884 writer->buffer = str;
12885 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012886 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012887 writer->size = 0;
12888 writer->pos += len;
12889 return 0;
12890 }
12891 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12892 return -1;
12893 }
12894 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12895 str, 0, len);
12896 writer->pos += len;
12897 return 0;
12898}
12899
12900PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012901_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012902{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012903 if (writer->pos == 0) {
12904 Py_XDECREF(writer->buffer);
12905 Py_INCREF(unicode_empty);
12906 return unicode_empty;
12907 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012908 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012909 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12910 return writer->buffer;
12911 }
12912 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12913 PyObject *newbuffer;
12914 newbuffer = resize_compact(writer->buffer, writer->pos);
12915 if (newbuffer == NULL) {
12916 Py_DECREF(writer->buffer);
12917 return NULL;
12918 }
12919 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012920 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012921 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012922 return writer->buffer;
12923}
12924
Victor Stinnerd3f08822012-05-29 12:57:52 +020012925void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012926_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012927{
12928 Py_CLEAR(writer->buffer);
12929}
12930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012932
12933PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012935\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012936Return a formatted version of S, using substitutions from args and kwargs.\n\
12937The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012938
Eric Smith27bbca62010-11-04 17:06:58 +000012939PyDoc_STRVAR(format_map__doc__,
12940 "S.format_map(mapping) -> str\n\
12941\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012942Return a formatted version of S, using substitutions from mapping.\n\
12943The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012944
Eric Smith4a7d76d2008-05-30 18:10:19 +000012945static PyObject *
12946unicode__format__(PyObject* self, PyObject* args)
12947{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012948 PyObject *format_spec;
12949 _PyUnicodeWriter writer;
12950 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012951
12952 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12953 return NULL;
12954
Victor Stinnerd3f08822012-05-29 12:57:52 +020012955 if (PyUnicode_READY(self) == -1)
12956 return NULL;
12957 _PyUnicodeWriter_Init(&writer, 0);
12958 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12959 self, format_spec, 0,
12960 PyUnicode_GET_LENGTH(format_spec));
12961 if (ret == -1) {
12962 _PyUnicodeWriter_Dealloc(&writer);
12963 return NULL;
12964 }
12965 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012966}
12967
Eric Smith8c663262007-08-25 02:26:07 +000012968PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012970\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012971Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012972
12973static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012974unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 Py_ssize_t size;
12977
12978 /* If it's a compact object, account for base structure +
12979 character data. */
12980 if (PyUnicode_IS_COMPACT_ASCII(v))
12981 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12982 else if (PyUnicode_IS_COMPACT(v))
12983 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012984 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 else {
12986 /* If it is a two-block object, account for base object, and
12987 for character block if present. */
12988 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012989 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012991 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 }
12993 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012994 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012995 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012997 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012998 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999
13000 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013001}
13002
13003PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013005
13006static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013007unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013008{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013009 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 if (!copy)
13011 return NULL;
13012 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013013}
13014
Guido van Rossumd57fd912000-03-10 22:53:23 +000013015static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013016 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013017 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013018 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13019 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013020 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13021 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013022 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013023 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13024 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13025 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13026 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13027 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013028 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013029 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13030 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13031 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013032 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013033 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13034 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13035 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013036 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013038 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013039 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013040 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13041 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13042 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13043 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13044 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13045 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13046 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13047 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13048 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13049 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13050 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13051 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13052 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13053 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013054 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013055 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013056 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013057 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013058 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013059 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013060 {"maketrans", (PyCFunction) unicode_maketrans,
13061 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013062 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013063#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013064 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013065 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066#endif
13067
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069 {NULL, NULL}
13070};
13071
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013072static PyObject *
13073unicode_mod(PyObject *v, PyObject *w)
13074{
Brian Curtindfc80e32011-08-10 20:28:54 -050013075 if (!PyUnicode_Check(v))
13076 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013078}
13079
13080static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013081 0, /*nb_add*/
13082 0, /*nb_subtract*/
13083 0, /*nb_multiply*/
13084 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013085};
13086
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 (lenfunc) unicode_length, /* sq_length */
13089 PyUnicode_Concat, /* sq_concat */
13090 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13091 (ssizeargfunc) unicode_getitem, /* sq_item */
13092 0, /* sq_slice */
13093 0, /* sq_ass_item */
13094 0, /* sq_ass_slice */
13095 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096};
13097
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013098static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013099unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013100{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 if (PyUnicode_READY(self) == -1)
13102 return NULL;
13103
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013104 if (PyIndex_Check(item)) {
13105 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013106 if (i == -1 && PyErr_Occurred())
13107 return NULL;
13108 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013110 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013111 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013112 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013113 PyObject *result;
13114 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013115 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013116 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013119 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013120 return NULL;
13121 }
13122
13123 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013124 Py_INCREF(unicode_empty);
13125 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013127 slicelength == PyUnicode_GET_LENGTH(self)) {
13128 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013129 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013130 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013131 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013132 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013133 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013134 src_kind = PyUnicode_KIND(self);
13135 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013136 if (!PyUnicode_IS_ASCII(self)) {
13137 kind_limit = kind_maxchar_limit(src_kind);
13138 max_char = 0;
13139 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13140 ch = PyUnicode_READ(src_kind, src_data, cur);
13141 if (ch > max_char) {
13142 max_char = ch;
13143 if (max_char >= kind_limit)
13144 break;
13145 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013146 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013147 }
Victor Stinner55c99112011-10-13 01:17:06 +020013148 else
13149 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013150 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013151 if (result == NULL)
13152 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013153 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013154 dest_data = PyUnicode_DATA(result);
13155
13156 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13158 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013159 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013160 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013161 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013162 } else {
13163 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13164 return NULL;
13165 }
13166}
13167
13168static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013169 (lenfunc)unicode_length, /* mp_length */
13170 (binaryfunc)unicode_subscript, /* mp_subscript */
13171 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013172};
13173
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175/* Helpers for PyUnicode_Format() */
13176
13177static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013178getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013180 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 (*p_argidx)++;
13183 if (arglen < 0)
13184 return args;
13185 else
13186 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187 }
13188 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190 return NULL;
13191}
13192
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013193/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194
Victor Stinnerd3f08822012-05-29 12:57:52 +020013195static int
13196formatfloat(PyObject *v, int flags, int prec, int type,
13197 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013199 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013201 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013202
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203 x = PyFloat_AsDouble(v);
13204 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013206
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013209
Eric Smith0923d1d2009-04-16 20:16:10 +000013210 p = PyOS_double_to_string(x, type, prec,
13211 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013212 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013213 return -1;
13214 len = strlen(p);
13215 if (writer) {
13216 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13217 return -1;
Victor Stinner3a7d0962012-05-29 18:53:56 +020013218 memcpy((char*)writer->data + writer->pos * writer->kind,
Victor Stinnerd3f08822012-05-29 12:57:52 +020013219 p,
13220 len);
13221 writer->pos += len;
13222 }
13223 else
13224 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013225 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013226 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227}
13228
Victor Stinnerd0880d52012-04-27 23:40:13 +020013229/* formatlong() emulates the format codes d, u, o, x and X, and
13230 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13231 * Python's regular ints.
13232 * Return value: a new PyUnicodeObject*, or NULL if error.
13233 * The output string is of the form
13234 * "-"? ("0x" | "0X")? digit+
13235 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13236 * set in flags. The case of hex digits will be correct,
13237 * There will be at least prec digits, zero-filled on the left if
13238 * necessary to get that many.
13239 * val object to be converted
13240 * flags bitmask of format flags; only F_ALT is looked at
13241 * prec minimum number of digits; 0-fill on left if needed
13242 * type a character in [duoxX]; u acts the same as d
13243 *
13244 * CAUTION: o, x and X conversions on regular ints can never
13245 * produce a '-' sign, but can for Python's unbounded ints.
13246 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013247static PyObject*
13248formatlong(PyObject *val, int flags, int prec, int type)
13249{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013250 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013251 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013252 Py_ssize_t i;
13253 int sign; /* 1 if '-', else 0 */
13254 int len; /* number of characters */
13255 Py_ssize_t llen;
13256 int numdigits; /* len == numnondigits + numdigits */
13257 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013258
Victor Stinnerd0880d52012-04-27 23:40:13 +020013259 /* Avoid exceeding SSIZE_T_MAX */
13260 if (prec > INT_MAX-3) {
13261 PyErr_SetString(PyExc_OverflowError,
13262 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013264 }
13265
13266 assert(PyLong_Check(val));
13267
13268 switch (type) {
13269 case 'd':
13270 case 'u':
13271 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013272 if (PyBool_Check(val))
13273 result = PyNumber_ToBase(val, 10);
13274 else
13275 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013276 break;
13277 case 'o':
13278 numnondigits = 2;
13279 result = PyNumber_ToBase(val, 8);
13280 break;
13281 case 'x':
13282 case 'X':
13283 numnondigits = 2;
13284 result = PyNumber_ToBase(val, 16);
13285 break;
13286 default:
13287 assert(!"'type' not in [duoxX]");
13288 }
13289 if (!result)
13290 return NULL;
13291
13292 assert(unicode_modifiable(result));
13293 assert(PyUnicode_IS_READY(result));
13294 assert(PyUnicode_IS_ASCII(result));
13295
13296 /* To modify the string in-place, there can only be one reference. */
13297 if (Py_REFCNT(result) != 1) {
13298 PyErr_BadInternalCall();
13299 return NULL;
13300 }
13301 buf = PyUnicode_DATA(result);
13302 llen = PyUnicode_GET_LENGTH(result);
13303 if (llen > INT_MAX) {
13304 PyErr_SetString(PyExc_ValueError,
13305 "string too large in _PyBytes_FormatLong");
13306 return NULL;
13307 }
13308 len = (int)llen;
13309 sign = buf[0] == '-';
13310 numnondigits += sign;
13311 numdigits = len - numnondigits;
13312 assert(numdigits > 0);
13313
13314 /* Get rid of base marker unless F_ALT */
13315 if (((flags & F_ALT) == 0 &&
13316 (type == 'o' || type == 'x' || type == 'X'))) {
13317 assert(buf[sign] == '0');
13318 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13319 buf[sign+1] == 'o');
13320 numnondigits -= 2;
13321 buf += 2;
13322 len -= 2;
13323 if (sign)
13324 buf[0] = '-';
13325 assert(len == numnondigits + numdigits);
13326 assert(numdigits > 0);
13327 }
13328
13329 /* Fill with leading zeroes to meet minimum width. */
13330 if (prec > numdigits) {
13331 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13332 numnondigits + prec);
13333 char *b1;
13334 if (!r1) {
13335 Py_DECREF(result);
13336 return NULL;
13337 }
13338 b1 = PyBytes_AS_STRING(r1);
13339 for (i = 0; i < numnondigits; ++i)
13340 *b1++ = *buf++;
13341 for (i = 0; i < prec - numdigits; i++)
13342 *b1++ = '0';
13343 for (i = 0; i < numdigits; i++)
13344 *b1++ = *buf++;
13345 *b1 = '\0';
13346 Py_DECREF(result);
13347 result = r1;
13348 buf = PyBytes_AS_STRING(result);
13349 len = numnondigits + prec;
13350 }
13351
13352 /* Fix up case for hex conversions. */
13353 if (type == 'X') {
13354 /* Need to convert all lower case letters to upper case.
13355 and need to convert 0x to 0X (and -0x to -0X). */
13356 for (i = 0; i < len; i++)
13357 if (buf[i] >= 'a' && buf[i] <= 'x')
13358 buf[i] -= 'a'-'A';
13359 }
13360 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13361 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013363 Py_DECREF(result);
13364 result = unicode;
13365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013366 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013367}
13368
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013369static Py_UCS4
13370formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013372 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013373 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013375 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 goto onError;
13378 }
13379 else {
13380 /* Integer input truncated to a character */
13381 long x;
13382 x = PyLong_AsLong(v);
13383 if (x == -1 && PyErr_Occurred())
13384 goto onError;
13385
Victor Stinner8faf8212011-12-08 22:14:11 +010013386 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 PyErr_SetString(PyExc_OverflowError,
13388 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013389 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 }
13391
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013393 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013394
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013396 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013398 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013399}
13400
Alexander Belopolsky40018472011-02-26 01:02:56 +000013401PyObject *
13402PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013404 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013407 PyObject *temp = NULL;
13408 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013409 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013410 void *fmt;
13411 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013412 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013413 Py_ssize_t sublen;
13414 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013415
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 PyErr_BadInternalCall();
13418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013420 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013421 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013423 if (PyUnicode_READY(uformat) == -1)
13424 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 fmt = PyUnicode_DATA(uformat);
13427 fmtkind = PyUnicode_KIND(uformat);
13428 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13429 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430
Victor Stinnerd3f08822012-05-29 12:57:52 +020013431 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013432
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 arglen = PyTuple_Size(args);
13435 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436 }
13437 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 arglen = -1;
13439 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013441 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013442 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444
13445 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013447 Py_ssize_t nonfmtpos;
13448 nonfmtpos = fmtpos++;
13449 while (fmtcnt >= 0 &&
13450 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13451 fmtpos++;
13452 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013454 if (fmtcnt < 0)
13455 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013456 sublen = fmtpos - nonfmtpos;
13457 maxchar = _PyUnicode_FindMaxChar(uformat,
13458 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013459 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013460 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013461
Victor Stinnerd3f08822012-05-29 12:57:52 +020013462 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13463 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013464 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013465 }
13466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 /* Got a format specifier */
13468 int flags = 0;
13469 Py_ssize_t width = -1;
13470 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013471 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013472 Py_UCS4 fill;
13473 int sign;
13474 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 int isnumok;
13476 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013477 void *pbuf = NULL;
13478 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013479 Py_UCS4 bufmaxchar;
13480 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013482 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013483 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13484 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 Py_ssize_t keylen;
13487 PyObject *key;
13488 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013489
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 if (dict == NULL) {
13491 PyErr_SetString(PyExc_TypeError,
13492 "format requires a mapping");
13493 goto onError;
13494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013497 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 /* Skip over balanced parentheses */
13499 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013500 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13501 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013503 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013507 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013508 if (fmtcnt < 0 || pcount > 0) {
13509 PyErr_SetString(PyExc_ValueError,
13510 "incomplete format key");
13511 goto onError;
13512 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013513 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013514 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 if (key == NULL)
13516 goto onError;
13517 if (args_owned) {
13518 Py_DECREF(args);
13519 args_owned = 0;
13520 }
13521 args = PyObject_GetItem(dict, key);
13522 Py_DECREF(key);
13523 if (args == NULL) {
13524 goto onError;
13525 }
13526 args_owned = 1;
13527 arglen = -1;
13528 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013529 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013531 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13532 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 case '-': flags |= F_LJUST; continue;
13534 case '+': flags |= F_SIGN; continue;
13535 case ' ': flags |= F_BLANK; continue;
13536 case '#': flags |= F_ALT; continue;
13537 case '0': flags |= F_ZERO; continue;
13538 }
13539 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013540 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 if (c == '*') {
13542 v = getnextarg(args, arglen, &argidx);
13543 if (v == NULL)
13544 goto onError;
13545 if (!PyLong_Check(v)) {
13546 PyErr_SetString(PyExc_TypeError,
13547 "* wants int");
13548 goto onError;
13549 }
13550 width = PyLong_AsLong(v);
13551 if (width == -1 && PyErr_Occurred())
13552 goto onError;
13553 if (width < 0) {
13554 flags |= F_LJUST;
13555 width = -width;
13556 }
13557 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 }
13560 else if (c >= '0' && c <= '9') {
13561 width = c - '0';
13562 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 if (c < '0' || c > '9')
13565 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013566 /* Since c is unsigned, the RHS would end up as unsigned,
13567 mixing signed and unsigned comparison. Since c is between
13568 '0' and '9', casting to int is safe. */
13569 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 PyErr_SetString(PyExc_ValueError,
13571 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013572 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 }
13574 width = width*10 + (c - '0');
13575 }
13576 }
13577 if (c == '.') {
13578 prec = 0;
13579 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 if (c == '*') {
13582 v = getnextarg(args, arglen, &argidx);
13583 if (v == NULL)
13584 goto onError;
13585 if (!PyLong_Check(v)) {
13586 PyErr_SetString(PyExc_TypeError,
13587 "* wants int");
13588 goto onError;
13589 }
13590 prec = PyLong_AsLong(v);
13591 if (prec == -1 && PyErr_Occurred())
13592 goto onError;
13593 if (prec < 0)
13594 prec = 0;
13595 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 }
13598 else if (c >= '0' && c <= '9') {
13599 prec = c - '0';
13600 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 if (c < '0' || c > '9')
13603 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013604 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 PyErr_SetString(PyExc_ValueError,
13606 "prec too big");
13607 goto onError;
13608 }
13609 prec = prec*10 + (c - '0');
13610 }
13611 }
13612 } /* prec */
13613 if (fmtcnt >= 0) {
13614 if (c == 'h' || c == 'l' || c == 'L') {
13615 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013616 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013617 }
13618 }
13619 if (fmtcnt < 0) {
13620 PyErr_SetString(PyExc_ValueError,
13621 "incomplete format");
13622 goto onError;
13623 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013624 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013625 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013626
13627 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013628 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013629 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013630 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13631 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013632 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013634
Victor Stinneraff3cc62012-04-30 05:19:21 +020013635 v = getnextarg(args, arglen, &argidx);
13636 if (v == NULL)
13637 goto onError;
13638
Benjamin Peterson29060642009-01-31 22:14:21 +000013639 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013640 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 fill = ' ';
13642 switch (c) {
13643
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 case 's':
13645 case 'r':
13646 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013647 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13648 /* Fast path */
13649 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13650 goto onError;
13651 goto nextarg;
13652 }
13653
Victor Stinner808fc0a2010-03-22 12:50:40 +000013654 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013655 temp = v;
13656 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013657 }
13658 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 if (c == 's')
13660 temp = PyObject_Str(v);
13661 else if (c == 'r')
13662 temp = PyObject_Repr(v);
13663 else
13664 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 break;
13667
13668 case 'i':
13669 case 'd':
13670 case 'u':
13671 case 'o':
13672 case 'x':
13673 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013674 if (PyLong_CheckExact(v)
13675 && width == -1 && prec == -1
13676 && !(flags & (F_SIGN | F_BLANK)))
13677 {
13678 /* Fast path */
13679 switch(c)
13680 {
13681 case 'd':
13682 case 'i':
13683 case 'u':
13684 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13685 goto onError;
13686 goto nextarg;
13687 case 'x':
13688 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13689 goto onError;
13690 goto nextarg;
13691 case 'o':
13692 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13693 goto onError;
13694 goto nextarg;
13695 default:
13696 break;
13697 }
13698 }
13699
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 isnumok = 0;
13701 if (PyNumber_Check(v)) {
13702 PyObject *iobj=NULL;
13703
13704 if (PyLong_Check(v)) {
13705 iobj = v;
13706 Py_INCREF(iobj);
13707 }
13708 else {
13709 iobj = PyNumber_Long(v);
13710 }
13711 if (iobj!=NULL) {
13712 if (PyLong_Check(iobj)) {
13713 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013714 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013715 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 }
13718 else {
13719 Py_DECREF(iobj);
13720 }
13721 }
13722 }
13723 if (!isnumok) {
13724 PyErr_Format(PyExc_TypeError,
13725 "%%%c format: a number is required, "
13726 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13727 goto onError;
13728 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013729 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 fill = '0';
13731 break;
13732
13733 case 'e':
13734 case 'E':
13735 case 'f':
13736 case 'F':
13737 case 'g':
13738 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013739 if (width == -1 && prec == -1
13740 && !(flags & (F_SIGN | F_BLANK)))
13741 {
13742 /* Fast path */
13743 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13744 goto onError;
13745 goto nextarg;
13746 }
13747
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013749 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013751 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13752 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 break;
13754
13755 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013756 {
13757 Py_UCS4 ch = formatchar(v);
13758 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013760 if (width == -1 && prec == -1) {
13761 /* Fast path */
13762 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13763 goto onError;
13764 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13765 writer.pos += 1;
13766 goto nextarg;
13767 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013768 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013770 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013771
13772 default:
13773 PyErr_Format(PyExc_ValueError,
13774 "unsupported format character '%c' (0x%x) "
13775 "at index %zd",
13776 (31<=c && c<=126) ? (char)c : '?',
13777 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013778 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 goto onError;
13780 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013781 if (temp == NULL)
13782 goto onError;
13783 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013784
13785 if (width == -1 && prec == -1
13786 && !(flags & (F_SIGN | F_BLANK)))
13787 {
13788 /* Fast path */
13789 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13790 goto onError;
13791 goto nextarg;
13792 }
13793
Victor Stinneraff3cc62012-04-30 05:19:21 +020013794 if (PyUnicode_READY(temp) == -1) {
13795 Py_CLEAR(temp);
13796 goto onError;
13797 }
13798 kind = PyUnicode_KIND(temp);
13799 pbuf = PyUnicode_DATA(temp);
13800 len = PyUnicode_GET_LENGTH(temp);
13801
13802 if (c == 's' || c == 'r' || c == 'a') {
13803 if (prec >= 0 && len > prec)
13804 len = prec;
13805 }
13806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013807 /* pbuf is initialized here. */
13808 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013809 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013810 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13811 if (ch == '-' || ch == '+') {
13812 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013813 len--;
13814 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013815 }
13816 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013817 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013819 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 else
13821 sign = 0;
13822 }
13823 if (width < len)
13824 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013825
13826 /* Compute the length and maximum character of the
13827 written characters */
13828 bufmaxchar = 127;
13829 if (!(flags & F_LJUST)) {
13830 if (sign) {
13831 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013833 }
13834 else {
13835 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013836 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013837 }
13838 }
13839 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013841
13842 buflen = width;
13843 if (sign && len == width)
13844 buflen++;
13845
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013846 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013847 goto onError;
13848
13849 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013851 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013852 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13853 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013854 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 if (width > len)
13856 width--;
13857 }
13858 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013860 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013862 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13863 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13864 writer.pos += 2;
13865 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 width -= 2;
13868 if (width < 0)
13869 width = 0;
13870 len -= 2;
13871 }
13872 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013873 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013874 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13875 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013876 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 }
13878 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013879 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013880 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13881 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013882 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13885 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013886 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13887 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13888 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013889 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013890 }
13891 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013892
Victor Stinnerd3f08822012-05-29 12:57:52 +020013893 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13894 temp, pindex, len);
Victor Stinneree4544c2012-05-09 22:24:08 +020013895 writer.pos += len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013896 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013897 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013898 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13899 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013900 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013901
Victor Stinnerd3f08822012-05-29 12:57:52 +020013902nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013903 if (dict && (argidx < arglen) && c != '%') {
13904 PyErr_SetString(PyExc_TypeError,
13905 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 goto onError;
13907 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013908 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910 } /* until end */
13911 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013912 PyErr_SetString(PyExc_TypeError,
13913 "not all arguments converted during string formatting");
13914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915 }
13916
13917 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013918 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919 }
13920 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013921 Py_XDECREF(temp);
13922 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013923 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013927 Py_XDECREF(temp);
13928 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013929 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013931 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013932 }
13933 return NULL;
13934}
13935
Jeremy Hylton938ace62002-07-17 16:30:39 +000013936static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013937unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13938
Tim Peters6d6c1a32001-08-02 04:15:00 +000013939static PyObject *
13940unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13941{
Benjamin Peterson29060642009-01-31 22:14:21 +000013942 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013943 static char *kwlist[] = {"object", "encoding", "errors", 0};
13944 char *encoding = NULL;
13945 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013946
Benjamin Peterson14339b62009-01-31 16:36:08 +000013947 if (type != &PyUnicode_Type)
13948 return unicode_subtype_new(type, args, kwds);
13949 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013950 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013951 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013952 if (x == NULL) {
13953 Py_INCREF(unicode_empty);
13954 return unicode_empty;
13955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013956 if (encoding == NULL && errors == NULL)
13957 return PyObject_Str(x);
13958 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013959 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013960}
13961
Guido van Rossume023fe02001-08-30 03:12:59 +000013962static PyObject *
13963unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13964{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013965 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013966 Py_ssize_t length, char_size;
13967 int share_wstr, share_utf8;
13968 unsigned int kind;
13969 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013970
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013972
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013973 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013974 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013975 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013976 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013977 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013978 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013979 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013980 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013981
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013982 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013983 if (self == NULL) {
13984 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 return NULL;
13986 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013987 kind = PyUnicode_KIND(unicode);
13988 length = PyUnicode_GET_LENGTH(unicode);
13989
13990 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013991#ifdef Py_DEBUG
13992 _PyUnicode_HASH(self) = -1;
13993#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013994 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013995#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013996 _PyUnicode_STATE(self).interned = 0;
13997 _PyUnicode_STATE(self).kind = kind;
13998 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013999 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014000 _PyUnicode_STATE(self).ready = 1;
14001 _PyUnicode_WSTR(self) = NULL;
14002 _PyUnicode_UTF8_LENGTH(self) = 0;
14003 _PyUnicode_UTF8(self) = NULL;
14004 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014005 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014006
14007 share_utf8 = 0;
14008 share_wstr = 0;
14009 if (kind == PyUnicode_1BYTE_KIND) {
14010 char_size = 1;
14011 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14012 share_utf8 = 1;
14013 }
14014 else if (kind == PyUnicode_2BYTE_KIND) {
14015 char_size = 2;
14016 if (sizeof(wchar_t) == 2)
14017 share_wstr = 1;
14018 }
14019 else {
14020 assert(kind == PyUnicode_4BYTE_KIND);
14021 char_size = 4;
14022 if (sizeof(wchar_t) == 4)
14023 share_wstr = 1;
14024 }
14025
14026 /* Ensure we won't overflow the length. */
14027 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14028 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014029 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014031 data = PyObject_MALLOC((length + 1) * char_size);
14032 if (data == NULL) {
14033 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034 goto onError;
14035 }
14036
Victor Stinnerc3c74152011-10-02 20:39:55 +020014037 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014038 if (share_utf8) {
14039 _PyUnicode_UTF8_LENGTH(self) = length;
14040 _PyUnicode_UTF8(self) = data;
14041 }
14042 if (share_wstr) {
14043 _PyUnicode_WSTR_LENGTH(self) = length;
14044 _PyUnicode_WSTR(self) = (wchar_t *)data;
14045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014046
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014047 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014048 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014049 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014050#ifdef Py_DEBUG
14051 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14052#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014053 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014054 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055
14056onError:
14057 Py_DECREF(unicode);
14058 Py_DECREF(self);
14059 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014060}
14061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014062PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014063 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014064\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014065Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014066encoding defaults to the current default string encoding.\n\
14067errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014068
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014069static PyObject *unicode_iter(PyObject *seq);
14070
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014072 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 "str", /* tp_name */
14074 sizeof(PyUnicodeObject), /* tp_size */
14075 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014076 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014077 (destructor)unicode_dealloc, /* tp_dealloc */
14078 0, /* tp_print */
14079 0, /* tp_getattr */
14080 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014081 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 unicode_repr, /* tp_repr */
14083 &unicode_as_number, /* tp_as_number */
14084 &unicode_as_sequence, /* tp_as_sequence */
14085 &unicode_as_mapping, /* tp_as_mapping */
14086 (hashfunc) unicode_hash, /* tp_hash*/
14087 0, /* tp_call*/
14088 (reprfunc) unicode_str, /* tp_str */
14089 PyObject_GenericGetAttr, /* tp_getattro */
14090 0, /* tp_setattro */
14091 0, /* tp_as_buffer */
14092 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014093 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 unicode_doc, /* tp_doc */
14095 0, /* tp_traverse */
14096 0, /* tp_clear */
14097 PyUnicode_RichCompare, /* tp_richcompare */
14098 0, /* tp_weaklistoffset */
14099 unicode_iter, /* tp_iter */
14100 0, /* tp_iternext */
14101 unicode_methods, /* tp_methods */
14102 0, /* tp_members */
14103 0, /* tp_getset */
14104 &PyBaseObject_Type, /* tp_base */
14105 0, /* tp_dict */
14106 0, /* tp_descr_get */
14107 0, /* tp_descr_set */
14108 0, /* tp_dictoffset */
14109 0, /* tp_init */
14110 0, /* tp_alloc */
14111 unicode_new, /* tp_new */
14112 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014113};
14114
14115/* Initialize the Unicode implementation */
14116
Victor Stinner3a50e702011-10-18 21:21:00 +020014117int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014119 int i;
14120
Thomas Wouters477c8d52006-05-27 19:21:47 +000014121 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014122 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014123 0x000A, /* LINE FEED */
14124 0x000D, /* CARRIAGE RETURN */
14125 0x001C, /* FILE SEPARATOR */
14126 0x001D, /* GROUP SEPARATOR */
14127 0x001E, /* RECORD SEPARATOR */
14128 0x0085, /* NEXT LINE */
14129 0x2028, /* LINE SEPARATOR */
14130 0x2029, /* PARAGRAPH SEPARATOR */
14131 };
14132
Fred Drakee4315f52000-05-09 19:53:39 +000014133 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014134 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014135 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014136 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014137 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014138
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014139 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014141 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014143
14144 /* initialize the linebreak bloom filter */
14145 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014146 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014147 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014148
14149 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014150
14151#ifdef HAVE_MBCS
14152 winver.dwOSVersionInfoSize = sizeof(winver);
14153 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14154 PyErr_SetFromWindowsErr(0);
14155 return -1;
14156 }
14157#endif
14158 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014159}
14160
14161/* Finalize the Unicode implementation */
14162
/* Releases nothing: the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14168
Guido van Rossumd57fd912000-03-10 22:53:23 +000014169void
Thomas Wouters78890102000-07-22 19:25:51 +000014170_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014171{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014172 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014173
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014174 Py_XDECREF(unicode_empty);
14175 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014177 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014178 if (unicode_latin1[i]) {
14179 Py_DECREF(unicode_latin1[i]);
14180 unicode_latin1[i] = NULL;
14181 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014182 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014183 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014184 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014186
Walter Dörwald16807132007-05-25 13:52:07 +000014187void
14188PyUnicode_InternInPlace(PyObject **p)
14189{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014190 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014191 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014192#ifdef Py_DEBUG
14193 assert(s != NULL);
14194 assert(_PyUnicode_CHECK(s));
14195#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014196 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014197 return;
14198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014199 /* If it's a subclass, we don't really know what putting
14200 it in the interned dict might do. */
14201 if (!PyUnicode_CheckExact(s))
14202 return;
14203 if (PyUnicode_CHECK_INTERNED(s))
14204 return;
14205 if (interned == NULL) {
14206 interned = PyDict_New();
14207 if (interned == NULL) {
14208 PyErr_Clear(); /* Don't leave an exception */
14209 return;
14210 }
14211 }
14212 /* It might be that the GetItem call fails even
14213 though the key is present in the dictionary,
14214 namely when this happens during a stack overflow. */
14215 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014216 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014218
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 if (t) {
14220 Py_INCREF(t);
14221 Py_DECREF(*p);
14222 *p = t;
14223 return;
14224 }
Walter Dörwald16807132007-05-25 13:52:07 +000014225
Benjamin Peterson14339b62009-01-31 16:36:08 +000014226 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014227 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014228 PyErr_Clear();
14229 PyThreadState_GET()->recursion_critical = 0;
14230 return;
14231 }
14232 PyThreadState_GET()->recursion_critical = 0;
14233 /* The two references in interned are not counted by refcnt.
14234 The deallocator will take care of this */
14235 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014236 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014237}
14238
14239void
14240PyUnicode_InternImmortal(PyObject **p)
14241{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014242 PyUnicode_InternInPlace(p);
14243 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014244 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 Py_INCREF(*p);
14246 }
Walter Dörwald16807132007-05-25 13:52:07 +000014247}
14248
14249PyObject *
14250PyUnicode_InternFromString(const char *cp)
14251{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 PyObject *s = PyUnicode_FromString(cp);
14253 if (s == NULL)
14254 return NULL;
14255 PyUnicode_InternInPlace(&s);
14256 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014257}
14258
Alexander Belopolsky40018472011-02-26 01:02:56 +000014259void
14260_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014262 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014263 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014264 Py_ssize_t i, n;
14265 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014266
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 if (interned == NULL || !PyDict_Check(interned))
14268 return;
14269 keys = PyDict_Keys(interned);
14270 if (keys == NULL || !PyList_Check(keys)) {
14271 PyErr_Clear();
14272 return;
14273 }
Walter Dörwald16807132007-05-25 13:52:07 +000014274
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14276 detector, interned unicode strings are not forcibly deallocated;
14277 rather, we give them their stolen references back, and then clear
14278 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014279
Benjamin Peterson14339b62009-01-31 16:36:08 +000014280 n = PyList_GET_SIZE(keys);
14281 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014282 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014284 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014285 if (PyUnicode_READY(s) == -1) {
14286 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014287 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014289 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 case SSTATE_NOT_INTERNED:
14291 /* XXX Shouldn't happen */
14292 break;
14293 case SSTATE_INTERNED_IMMORTAL:
14294 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014295 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014296 break;
14297 case SSTATE_INTERNED_MORTAL:
14298 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 break;
14301 default:
14302 Py_FatalError("Inconsistent interned string state.");
14303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014304 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014305 }
14306 fprintf(stderr, "total size of all interned strings: "
14307 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14308 "mortal/immortal\n", mortal_size, immortal_size);
14309 Py_DECREF(keys);
14310 PyDict_Clear(interned);
14311 Py_DECREF(interned);
14312 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014313}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014314
14315
14316/********************* Unicode Iterator **************************/
14317
14318typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 PyObject_HEAD
14320 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014321 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014322} unicodeiterobject;
14323
14324static void
14325unicodeiter_dealloc(unicodeiterobject *it)
14326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014327 _PyObject_GC_UNTRACK(it);
14328 Py_XDECREF(it->it_seq);
14329 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014330}
14331
14332static int
14333unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14334{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014335 Py_VISIT(it->it_seq);
14336 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014337}
14338
14339static PyObject *
14340unicodeiter_next(unicodeiterobject *it)
14341{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014342 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014343
Benjamin Peterson14339b62009-01-31 16:36:08 +000014344 assert(it != NULL);
14345 seq = it->it_seq;
14346 if (seq == NULL)
14347 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014348 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014350 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14351 int kind = PyUnicode_KIND(seq);
14352 void *data = PyUnicode_DATA(seq);
14353 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14354 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014355 if (item != NULL)
14356 ++it->it_index;
14357 return item;
14358 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014359
Benjamin Peterson14339b62009-01-31 16:36:08 +000014360 Py_DECREF(seq);
14361 it->it_seq = NULL;
14362 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014363}
14364
14365static PyObject *
14366unicodeiter_len(unicodeiterobject *it)
14367{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 Py_ssize_t len = 0;
14369 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014370 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014371 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014372}
14373
14374PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14375
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014376static PyObject *
14377unicodeiter_reduce(unicodeiterobject *it)
14378{
14379 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014380 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014381 it->it_seq, it->it_index);
14382 } else {
14383 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14384 if (u == NULL)
14385 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014386 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014387 }
14388}
14389
14390PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14391
14392static PyObject *
14393unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14394{
14395 Py_ssize_t index = PyLong_AsSsize_t(state);
14396 if (index == -1 && PyErr_Occurred())
14397 return NULL;
14398 if (index < 0)
14399 index = 0;
14400 it->it_index = index;
14401 Py_RETURN_NONE;
14402}
14403
14404PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14405
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014406static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014407 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014409 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14410 reduce_doc},
14411 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14412 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014414};
14415
14416PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14418 "str_iterator", /* tp_name */
14419 sizeof(unicodeiterobject), /* tp_basicsize */
14420 0, /* tp_itemsize */
14421 /* methods */
14422 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14423 0, /* tp_print */
14424 0, /* tp_getattr */
14425 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014426 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014427 0, /* tp_repr */
14428 0, /* tp_as_number */
14429 0, /* tp_as_sequence */
14430 0, /* tp_as_mapping */
14431 0, /* tp_hash */
14432 0, /* tp_call */
14433 0, /* tp_str */
14434 PyObject_GenericGetAttr, /* tp_getattro */
14435 0, /* tp_setattro */
14436 0, /* tp_as_buffer */
14437 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14438 0, /* tp_doc */
14439 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14440 0, /* tp_clear */
14441 0, /* tp_richcompare */
14442 0, /* tp_weaklistoffset */
14443 PyObject_SelfIter, /* tp_iter */
14444 (iternextfunc)unicodeiter_next, /* tp_iternext */
14445 unicodeiter_methods, /* tp_methods */
14446 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014447};
14448
14449static PyObject *
14450unicode_iter(PyObject *seq)
14451{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014452 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014453
Benjamin Peterson14339b62009-01-31 16:36:08 +000014454 if (!PyUnicode_Check(seq)) {
14455 PyErr_BadInternalCall();
14456 return NULL;
14457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014458 if (PyUnicode_READY(seq) == -1)
14459 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14461 if (it == NULL)
14462 return NULL;
14463 it->it_index = 0;
14464 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014465 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 _PyObject_GC_TRACK(it);
14467 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014468}
14469
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014470
14471size_t
14472Py_UNICODE_strlen(const Py_UNICODE *u)
14473{
14474 int res = 0;
14475 while(*u++)
14476 res++;
14477 return res;
14478}
14479
14480Py_UNICODE*
14481Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14482{
14483 Py_UNICODE *u = s1;
14484 while ((*u++ = *s2++));
14485 return s1;
14486}
14487
14488Py_UNICODE*
14489Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14490{
14491 Py_UNICODE *u = s1;
14492 while ((*u++ = *s2++))
14493 if (n-- == 0)
14494 break;
14495 return s1;
14496}
14497
14498Py_UNICODE*
14499Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14500{
14501 Py_UNICODE *u1 = s1;
14502 u1 += Py_UNICODE_strlen(u1);
14503 Py_UNICODE_strcpy(u1, s2);
14504 return s1;
14505}
14506
14507int
14508Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14509{
14510 while (*s1 && *s2 && *s1 == *s2)
14511 s1++, s2++;
14512 if (*s1 && *s2)
14513 return (*s1 < *s2) ? -1 : +1;
14514 if (*s1)
14515 return 1;
14516 if (*s2)
14517 return -1;
14518 return 0;
14519}
14520
14521int
14522Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14523{
14524 register Py_UNICODE u1, u2;
14525 for (; n != 0; n--) {
14526 u1 = *s1;
14527 u2 = *s2;
14528 if (u1 != u2)
14529 return (u1 < u2) ? -1 : +1;
14530 if (u1 == '\0')
14531 return 0;
14532 s1++;
14533 s2++;
14534 }
14535 return 0;
14536}
14537
14538Py_UNICODE*
14539Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14540{
14541 const Py_UNICODE *p;
14542 for (p = s; *p; p++)
14543 if (*p == c)
14544 return (Py_UNICODE*)p;
14545 return NULL;
14546}
14547
14548Py_UNICODE*
14549Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14550{
14551 const Py_UNICODE *p;
14552 p = s + Py_UNICODE_strlen(s);
14553 while (p != s) {
14554 p--;
14555 if (*p == c)
14556 return (Py_UNICODE*)p;
14557 }
14558 return NULL;
14559}
Victor Stinner331ea922010-08-10 16:37:20 +000014560
Victor Stinner71133ff2010-09-01 23:43:53 +000014561Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014562PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014563{
Victor Stinner577db2c2011-10-11 22:12:48 +020014564 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014565 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014567 if (!PyUnicode_Check(unicode)) {
14568 PyErr_BadArgument();
14569 return NULL;
14570 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014571 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014572 if (u == NULL)
14573 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014574 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014575 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014576 PyErr_NoMemory();
14577 return NULL;
14578 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014579 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014580 size *= sizeof(Py_UNICODE);
14581 copy = PyMem_Malloc(size);
14582 if (copy == NULL) {
14583 PyErr_NoMemory();
14584 return NULL;
14585 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014586 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014587 return copy;
14588}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014589
Georg Brandl66c221e2010-10-14 07:04:07 +000014590/* A _string module, to export formatter_parser and formatter_field_name_split
14591 to the string.Formatter class implemented in Python. */
14592
14593static PyMethodDef _string_methods[] = {
14594 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14595 METH_O, PyDoc_STR("split the argument as a field name")},
14596 {"formatter_parser", (PyCFunction) formatter_parser,
14597 METH_O, PyDoc_STR("parse the argument as a format string")},
14598 {NULL, NULL}
14599};
14600
14601static struct PyModuleDef _string_module = {
14602 PyModuleDef_HEAD_INIT,
14603 "_string",
14604 PyDoc_STR("string helper module"),
14605 0,
14606 _string_methods,
14607 NULL,
14608 NULL,
14609 NULL,
14610 NULL
14611};
14612
14613PyMODINIT_FUNC
14614PyInit__string(void)
14615{
14616 return PyModule_Create(&_string_module);
14617}
14618
14619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014620#ifdef __cplusplus
14621}
14622#endif