blob: 80a583cb68ed33e08db7c65edd9890554f623c7f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN and BYTEORDER_IS_LITTLE_ENDIAN is
   defined, depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* _PyUnicode_CHECK(op): the check used inside the assertions of this file.
   With Py_DEBUG it runs _PyUnicode_CheckConsistency() with content checking
   disabled; otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors.  The macros without an assert expand to the struct
   member itself (after casting op to the relevant object layout); the
   others first assert _PyUnicode_CHECK(op). */

/* utf8 member of the PyCompactUnicodeObject layout */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   member */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the PyCompactUnicodeObject layout */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII objects this is the
   length member, otherwise the utf8_length member */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr member of the PyASCIIObject layout */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length member of the PyCompactUnicodeObject layout */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length member of the PyASCIIObject layout */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* state member of the PyASCIIObject layout */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* hash member of the PyASCIIObject layout */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op) */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting _PyUnicode_CHECK(op) */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the PyUnicodeObject layout */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of both arguments: it is not necessarily
   equal to either of them, but it is never smaller than the larger one. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
118
/* File-local replacement for PyUnicode_READY(): asserts
   _PyUnicode_CHECK(op), then evaluates to 0 if op is already ready and to
   the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 pointer of op is the same pointer as its character data
   (the UTF-8 representation is shared with the data buffer).  Must not be
   used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same pointer as its character data
   (the wchar_t representation is shared with the data buffer).
   The macro only refers to its own argument: it must not depend on the
   caller happening to have a variable named "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   object is not ready yet or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The main loop
   copies four characters per iteration; the remaining 0-3 characters are
   copied one by one.  All three pointer arguments are evaluated exactly
   once, and "to" may be an arbitrary expression: it is parenthesized before
   being cast.  Locals use a leading underscore to avoid clashing with the
   caller's names. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *) (to);                \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by the character's code point
   (0-255); a NULL entry means that no object is cached for that character
   yet. */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise.
   Each row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
/* forward declarations of helpers that are used before their definition
   (the definitions are not part of this section of the file) */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of size characters of a fixed width
   (1, 2 or 4 bytes per character). */
static PyObject *
_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoding error helpers.  startpos/endpos delimit the offending range in
   unicode; the error handler variant reports a position through *newpos
   (presumably where encoding should resume -- see the definition). */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Same for linebreaks: 128-entry table indexed by ASCII code, 1 for the
   characters treated as line breaks, 0 otherwise.  The table is only ever
   read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal structure of a unicode object.

   op must be a unicode object.  Every invariant is verified with assert(),
   so a violation aborts; when all checks pass the function returns 1, which
   allows it to be used as assert(_PyUnicode_CheckConsistency(op, ...)).

   If check_content is true (and the string is not in the legacy wchar_t
   state), the characters are scanned as well to verify that the narrowest
   possible kind is used and that the buffer is null-terminated. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject fields exist */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: the characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is filled in */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* a non-compact ASCII string shares utf8 with its data */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be shared with the data exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest character must fall in the range of the kind */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
415 assert(Py_REFCNT(unicode) == 1);
416
417 len = _PyUnicode_WSTR_LENGTH(unicode);
418 if (len == 0) {
419 Py_INCREF(unicode_empty);
420 Py_DECREF(unicode);
421 return unicode_empty;
422 }
423
424 if (len == 1) {
425 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426 if (ch < 256) {
427 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428 Py_DECREF(unicode);
429 return latin1_char;
430 }
431 }
432
433 if (_PyUnicode_Ready(unicode) < 0) {
434 Py_XDECREF(unicode);
435 return NULL;
436 }
437#else
438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
/* Windows version information, only declared when HAVE_MBCS is defined.
   NOTE(review): it is not filled in within this section of the file;
   check where it is initialized before relying on its content. */
#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (the character modulo BLOOM_WIDTH) as
   the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH: number of bits used in a bloom mask.  It is the largest of
   128, 64 and 32 that does not exceed LONG_BIT; smaller values of LONG_BIT
   are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
528
/* Type of a bloom mask: one bit per possible value of
   (character & (BLOOM_WIDTH - 1)). */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit of character ch in mask (mask must be a
   modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit of character ch is set in mask.  A
   zero result proves that ch was never added; a non-zero result only means
   that ch *may* have been added, since characters that are equal modulo
   BLOOM_WIDTH share a bit.
   Both macros parenthesize mask so that any expression can be passed. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* BLOOM_LINEBREAK(ch): true if ch is a line break.  Characters below 128
   are looked up in the ascii_linebreak table; for the others the bloom mask
   is tested first and Py_UNICODE_ISLINEBREAK() is only called when the mask
   does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
/* BLOOM_MEMBER(mask, chr, str): true if the bit of chr is set in mask and
   PyUnicode_FindChar() finds chr in str (searching the whole string).
   mask is expected to be the bloom mask of str, so that the search is
   skipped whenever the mask already rules chr out. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
/* Forward declaration; the definition is not part of this section of the
   file.  fixfct is a callback that takes a unicode object and returns a
   Py_UCS4. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
/* Resize a compact string to length characters by reallocating the object
   itself (the characters are stored in the same memory block as the
   object header).

   unicode must be modifiable, ready and compact.  Returns the (possibly
   moved) object, or NULL with MemoryError set if the new size overflows or
   the reallocation fails.  On success the caller must only use the returned
   pointer from then on; the string is null-terminated at the new length. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used directly as the size of one character */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember whether wstr points into the block that is about to move */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* reject lengths for which the size computation below would overflow
       (the "- 1" accounts for the trailing null character) */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    /* unregister the object from the reference tracking before its address
       may change */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
    if (new_unicode == NULL) {
        /* the original block is still valid: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* re-point wstr at the character data of the new block */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    /* write the trailing null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
691
Alexander Belopolsky40018472011-02-26 01:02:56 +0000692static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200693resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694{
Victor Stinner95663112011-10-04 01:03:50 +0200695 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100696 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700 if (PyUnicode_IS_READY(unicode)) {
701 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200702 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 void *data;
704
705 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200706 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709
710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
711 PyErr_NoMemory();
712 return -1;
713 }
714 new_size = (length + 1) * char_size;
715
Victor Stinner7a9105a2011-12-12 00:13:42 +0100716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
717 {
718 PyObject_DEL(_PyUnicode_UTF8(unicode));
719 _PyUnicode_UTF8(unicode) = NULL;
720 _PyUnicode_UTF8_LENGTH(unicode) = 0;
721 }
722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 data = (PyObject *)PyObject_REALLOC(data, new_size);
724 if (data == NULL) {
725 PyErr_NoMemory();
726 return -1;
727 }
728 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200731 _PyUnicode_WSTR_LENGTH(unicode) = length;
732 }
733 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200734 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_UTF8_LENGTH(unicode) = length;
736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200740 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 }
Victor Stinner95663112011-10-04 01:03:50 +0200744 assert(_PyUnicode_WSTR(unicode) != NULL);
745
746 /* check for integer overflow */
747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
748 PyErr_NoMemory();
749 return -1;
750 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100751 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200752 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100753 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200754 if (!wstr) {
755 PyErr_NoMemory();
756 return -1;
757 }
758 _PyUnicode_WSTR(unicode) = wstr;
759 _PyUnicode_WSTR(unicode)[length] = 0;
760 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200761 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 return 0;
763}
764
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771
Benjamin Petersonbac79492012-01-14 13:34:47 -0500772 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774
775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776 if (copy == NULL)
777 return NULL;
778
779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200782 }
783 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200784 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200786 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 if (w == NULL)
788 return NULL;
789 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790 copy_length = Py_MIN(copy_length, length);
791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200793 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 }
795}
796
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000798 Ux0000 terminated; some code (e.g. new_identifier)
799 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000800
801 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000802 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000803
804*/
805
Alexander Belopolsky40018472011-02-26 01:02:56 +0000806static PyUnicodeObject *
807_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808{
809 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811
Thomas Wouters477c8d52006-05-27 19:21:47 +0000812 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 if (length == 0 && unicode_empty != NULL) {
814 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200815 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 }
817
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000818 /* Ensure we won't overflow the size. */
819 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
820 return (PyUnicodeObject *)PyErr_NoMemory();
821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822 if (length < 0) {
823 PyErr_SetString(PyExc_SystemError,
824 "Negative size passed to _PyUnicode_New");
825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
829 if (unicode == NULL)
830 return NULL;
831 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
832 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
833 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100834 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000835 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838
Jeremy Hyltond8082792003-09-16 19:41:39 +0000839 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000840 * the caller fails before initializing str -- unicode_resize()
841 * reads str[0], and the Keep-Alive optimization can keep memory
842 * allocated for str alive across a call to unicode_dealloc(unicode).
843 * We don't want unicode_resize to read uninitialized memory in
844 * that case.
845 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 _PyUnicode_WSTR(unicode)[0] = 0;
847 _PyUnicode_WSTR(unicode)[length] = 0;
848 _PyUnicode_WSTR_LENGTH(unicode) = length;
849 _PyUnicode_HASH(unicode) = -1;
850 _PyUnicode_STATE(unicode).interned = 0;
851 _PyUnicode_STATE(unicode).kind = 0;
852 _PyUnicode_STATE(unicode).compact = 0;
853 _PyUnicode_STATE(unicode).ready = 0;
854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200855 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200857 _PyUnicode_UTF8(unicode) = NULL;
858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000860 return unicode;
861}
862
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863static const char*
864unicode_kind_name(PyObject *unicode)
865{
Victor Stinner42dfd712011-10-03 14:41:45 +0200866 /* don't check consistency: unicode_kind_name() is called from
867 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200868 if (!PyUnicode_IS_COMPACT(unicode))
869 {
870 if (!PyUnicode_IS_READY(unicode))
871 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600872 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200873 {
874 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200875 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 return "legacy ascii";
877 else
878 return "legacy latin1";
879 case PyUnicode_2BYTE_KIND:
880 return "legacy UCS2";
881 case PyUnicode_4BYTE_KIND:
882 return "legacy UCS4";
883 default:
884 return "<legacy invalid kind>";
885 }
886 }
887 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600888 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200890 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 return "ascii";
892 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 default:
899 return "<invalid compact kind>";
900 }
901}
902
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return the UTF-8 pointer of the string (wraps PyUnicode_UTF8). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the data pointer of a compact string (wraps
   _PyUnicode_COMPACT_DATA). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of the string
   on stdout, then return its data pointer (wraps PyUnicode_DATA). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    /* %p requires a void* argument */
    printf("compact data %p\n", (void*)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string on stdout: kind, length, the
   wstr and utf8 pointers (marked "shared" when they alias the data buffer)
   and the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data: right after the object header for compact
       strings, in a separate buffer otherwise. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* Lengths are signed (Py_ssize_t), so they are printed with %zd */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void*)ascii->wstr);

    /* Compact ASCII strings have no wstr_length/utf8/utf8_length fields */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void*)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
954
955PyObject *
956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
957{
958 PyObject *obj;
959 PyCompactUnicodeObject *unicode;
960 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200961 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200962 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 Py_ssize_t char_size;
964 Py_ssize_t struct_size;
965
966 /* Optimization for empty strings */
967 if (size == 0 && unicode_empty != NULL) {
968 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200969 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 }
971
Victor Stinner9e9d6892011-10-04 01:02:02 +0200972 is_ascii = 0;
973 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 struct_size = sizeof(PyCompactUnicodeObject);
975 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200976 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 char_size = 1;
978 is_ascii = 1;
979 struct_size = sizeof(PyASCIIObject);
980 }
981 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200982 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 char_size = 1;
984 }
985 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +0200986 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 char_size = 2;
988 if (sizeof(wchar_t) == 2)
989 is_sharing = 1;
990 }
991 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +0100992 if (maxchar > MAX_UNICODE) {
993 PyErr_SetString(PyExc_SystemError,
994 "invalid maximum character passed to PyUnicode_New");
995 return NULL;
996 }
Victor Stinner8f825062012-04-27 13:55:39 +0200997 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 char_size = 4;
999 if (sizeof(wchar_t) == 4)
1000 is_sharing = 1;
1001 }
1002
1003 /* Ensure we won't overflow the size. */
1004 if (size < 0) {
1005 PyErr_SetString(PyExc_SystemError,
1006 "Negative size passed to PyUnicode_New");
1007 return NULL;
1008 }
1009 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1010 return PyErr_NoMemory();
1011
1012 /* Duplicated allocation code from _PyObject_New() instead of a call to
1013 * PyObject_New() so we are able to allocate space for the object and
1014 * it's data buffer.
1015 */
1016 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1017 if (obj == NULL)
1018 return PyErr_NoMemory();
1019 obj = PyObject_INIT(obj, &PyUnicode_Type);
1020 if (obj == NULL)
1021 return NULL;
1022
1023 unicode = (PyCompactUnicodeObject *)obj;
1024 if (is_ascii)
1025 data = ((PyASCIIObject*)obj) + 1;
1026 else
1027 data = unicode + 1;
1028 _PyUnicode_LENGTH(unicode) = size;
1029 _PyUnicode_HASH(unicode) = -1;
1030 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032 _PyUnicode_STATE(unicode).compact = 1;
1033 _PyUnicode_STATE(unicode).ready = 1;
1034 _PyUnicode_STATE(unicode).ascii = is_ascii;
1035 if (is_ascii) {
1036 ((char*)data)[size] = 0;
1037 _PyUnicode_WSTR(unicode) = NULL;
1038 }
Victor Stinner8f825062012-04-27 13:55:39 +02001039 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 else {
1047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 ((Py_UCS4*)data)[size] = 0;
1053 if (is_sharing) {
1054 _PyUnicode_WSTR_LENGTH(unicode) = size;
1055 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1056 }
1057 else {
1058 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 }
1061 }
Victor Stinner8f825062012-04-27 13:55:39 +02001062#ifdef Py_DEBUG
1063 /* Fill the data with invalid characters to detect bugs earlier.
1064 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1065 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1066 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1067 memset(data, 0xff, size * kind);
1068#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001069 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 return obj;
1071}
1072
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data of `unicode`.  A high surrogate directly followed by a
   low surrogate is joined into a single code point; every other unit,
   including a lone surrogate, is copied unchanged.  The other conversions
   are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1112
Victor Stinnercd9950f2011-10-02 00:34:53 +02001113static int
Victor Stinner488fa492011-12-12 00:01:39 +01001114unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115{
Victor Stinner488fa492011-12-12 00:01:39 +01001116 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001117 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001118 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119 return -1;
1120 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return 0;
1122}
1123
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001124static int
1125_copy_characters(PyObject *to, Py_ssize_t to_start,
1126 PyObject *from, Py_ssize_t from_start,
1127 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001129 unsigned int from_kind, to_kind;
1130 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131
Victor Stinneree4544c2012-05-09 22:24:08 +02001132 assert(0 <= how_many);
1133 assert(0 <= from_start);
1134 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001137 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerd3f08822012-05-29 12:57:52 +02001139 assert(PyUnicode_Check(to));
1140 assert(PyUnicode_IS_READY(to));
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001143 if (how_many == 0)
1144 return 0;
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001151 if (from_kind == to_kind) {
1152 if (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) {
1153 /* Writing Latin-1 characters into an ASCII string requires to
1154 check that all written characters are pure ASCII */
1155#ifndef Py_DEBUG
1156 if (check_maxchar) {
1157 Py_UCS4 max_char;
1158 max_char = ucs1lib_find_max_char(from_data,
1159 (char*)from_data + how_many);
1160 if (max_char >= 128)
1161 return -1;
1162 }
1163#else
1164 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1165 Py_UCS4 ch;
1166 Py_ssize_t i;
1167 for (i=0; i < how_many; i++) {
1168 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1169 assert(ch <= to_maxchar);
1170 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171#endif
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001172 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001173 Py_MEMCPY((char*)to_data + to_kind * to_start,
1174 (char*)from_data + from_kind * from_start,
1175 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001177 else if (from_kind == PyUnicode_1BYTE_KIND
1178 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001179 {
1180 _PyUnicode_CONVERT_BYTES(
1181 Py_UCS1, Py_UCS2,
1182 PyUnicode_1BYTE_DATA(from) + from_start,
1183 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1184 PyUnicode_2BYTE_DATA(to) + to_start
1185 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001186 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001187 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 && to_kind == PyUnicode_4BYTE_KIND)
1189 {
1190 _PyUnicode_CONVERT_BYTES(
1191 Py_UCS1, Py_UCS4,
1192 PyUnicode_1BYTE_DATA(from) + from_start,
1193 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1194 PyUnicode_4BYTE_DATA(to) + to_start
1195 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001196 }
1197 else if (from_kind == PyUnicode_2BYTE_KIND
1198 && to_kind == PyUnicode_4BYTE_KIND)
1199 {
1200 _PyUnicode_CONVERT_BYTES(
1201 Py_UCS2, Py_UCS4,
1202 PyUnicode_2BYTE_DATA(from) + from_start,
1203 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1204 PyUnicode_4BYTE_DATA(to) + to_start
1205 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001207 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001208 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1209
1210#ifndef Py_DEBUG
1211 if (!check_maxchar) {
1212 if (from_kind == PyUnicode_2BYTE_KIND
1213 && to_kind == PyUnicode_1BYTE_KIND)
1214 {
1215 _PyUnicode_CONVERT_BYTES(
1216 Py_UCS2, Py_UCS1,
1217 PyUnicode_2BYTE_DATA(from) + from_start,
1218 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1219 PyUnicode_1BYTE_DATA(to) + to_start
1220 );
1221 }
1222 else if (from_kind == PyUnicode_4BYTE_KIND
1223 && to_kind == PyUnicode_1BYTE_KIND)
1224 {
1225 _PyUnicode_CONVERT_BYTES(
1226 Py_UCS4, Py_UCS1,
1227 PyUnicode_4BYTE_DATA(from) + from_start,
1228 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1229 PyUnicode_1BYTE_DATA(to) + to_start
1230 );
1231 }
1232 else if (from_kind == PyUnicode_4BYTE_KIND
1233 && to_kind == PyUnicode_2BYTE_KIND)
1234 {
1235 _PyUnicode_CONVERT_BYTES(
1236 Py_UCS4, Py_UCS2,
1237 PyUnicode_4BYTE_DATA(from) + from_start,
1238 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1239 PyUnicode_2BYTE_DATA(to) + to_start
1240 );
1241 }
1242 else {
1243 assert(0);
1244 return -1;
1245 }
1246 }
1247 else
1248#endif
Victor Stinnerf42dc442011-10-02 23:33:16 +02001249 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001250 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001251 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 Py_ssize_t i;
1253
Victor Stinnera0702ab2011-09-29 14:14:38 +02001254 for (i=0; i < how_many; i++) {
1255 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001256#ifndef Py_DEBUG
Victor Stinner56c161a2011-10-06 02:47:11 +02001257 assert(ch <= to_maxchar);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001258#else
1259 if (ch > to_maxchar)
1260 return -1;
1261#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001262 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1263 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001264 }
1265 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001266 return 0;
1267}
1268
Victor Stinnerd3f08822012-05-29 12:57:52 +02001269void
1270_PyUnicode_FastCopyCharacters(
1271 PyObject *to, Py_ssize_t to_start,
1272 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001273{
1274 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1275}
1276
1277Py_ssize_t
1278PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1279 PyObject *from, Py_ssize_t from_start,
1280 Py_ssize_t how_many)
1281{
1282 int err;
1283
1284 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1285 PyErr_BadInternalCall();
1286 return -1;
1287 }
1288
Benjamin Petersonbac79492012-01-14 13:34:47 -05001289 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001291 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001292 return -1;
1293
Victor Stinnerd3f08822012-05-29 12:57:52 +02001294 if (from_start < 0) {
1295 PyErr_SetString(PyExc_IndexError, "string index out of range");
1296 return -1;
1297 }
1298 if (to_start < 0) {
1299 PyErr_SetString(PyExc_IndexError, "string index out of range");
1300 return -1;
1301 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001302 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1303 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1304 PyErr_Format(PyExc_SystemError,
1305 "Cannot write %zi characters at %zi "
1306 "in a string of %zi characters",
1307 how_many, to_start, PyUnicode_GET_LENGTH(to));
1308 return -1;
1309 }
1310
1311 if (how_many == 0)
1312 return 0;
1313
Victor Stinner488fa492011-12-12 00:01:39 +01001314 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 return -1;
1316
1317 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1318 if (err) {
1319 PyErr_Format(PyExc_SystemError,
1320 "Cannot copy %s characters "
1321 "into a string of %s characters",
1322 unicode_kind_name(from),
1323 unicode_kind_name(to));
1324 return -1;
1325 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327}
1328
Victor Stinner17222162011-09-28 22:15:37 +02001329/* Find the maximum code point and count the number of surrogate pairs so a
1330 correct string length can be computed before converting a string to UCS4.
1331 This function counts single surrogates as a character and not as a pair.
1332
1333 Return 0 on success, or -1 on error. */
1334static int
1335find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1336 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337{
1338 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001339 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340
Victor Stinnerc53be962011-10-02 21:33:54 +02001341 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 *num_surrogates = 0;
1343 *maxchar = 0;
1344
1345 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001347 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1348 && (iter+1) < end
1349 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001351 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353 iter += 2;
1354 }
1355 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001357 {
1358 ch = *iter;
1359 iter++;
1360 }
1361 if (ch > *maxchar) {
1362 *maxchar = ch;
1363 if (*maxchar > MAX_UNICODE) {
1364 PyErr_Format(PyExc_ValueError,
1365 "character U+%x is not in range [U+0000; U+10ffff]",
1366 ch);
1367 return -1;
1368 }
1369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 }
1371 return 0;
1372}
1373
#ifdef Py_DEBUG
/* Debug-build counter, incremented on every call to _PyUnicode_Ready() */
static int unicode_ready_calls = 0;
#endif
1377
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001378int
1379_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380{
1381 wchar_t *end;
1382 Py_UCS4 maxchar = 0;
1383 Py_ssize_t num_surrogates;
1384#if SIZEOF_WCHAR_T == 2
1385 Py_ssize_t length_wo_surrogates;
1386#endif
1387
Georg Brandl7597add2011-10-05 16:36:47 +02001388 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001389 strings were created using _PyObject_New() and where no canonical
1390 representation (the str field) has been set yet aka strings
1391 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001392 assert(_PyUnicode_CHECK(unicode));
1393 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001395 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001396 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001397 /* Actually, it should neither be interned nor be anything else: */
1398 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399
1400#ifdef Py_DEBUG
1401 ++unicode_ready_calls;
1402#endif
1403
1404 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001405 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001406 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
1409 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001410 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1411 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 PyErr_NoMemory();
1413 return -1;
1414 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001415 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 _PyUnicode_WSTR(unicode), end,
1417 PyUnicode_1BYTE_DATA(unicode));
1418 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1419 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1420 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1421 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001422 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001423 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001424 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
1426 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001427 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001428 _PyUnicode_UTF8(unicode) = NULL;
1429 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 }
1431 PyObject_FREE(_PyUnicode_WSTR(unicode));
1432 _PyUnicode_WSTR(unicode) = NULL;
1433 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1434 }
1435 /* In this case we might have to convert down from 4-byte native
1436 wchar_t to 2-byte unicode. */
1437 else if (maxchar < 65536) {
1438 assert(num_surrogates == 0 &&
1439 "FindMaxCharAndNumSurrogatePairs() messed up");
1440
Victor Stinner506f5922011-09-28 22:34:18 +02001441#if SIZEOF_WCHAR_T == 2
1442 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001443 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001444 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1445 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1446 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001447 _PyUnicode_UTF8(unicode) = NULL;
1448 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001449#else
1450 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001452 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001453 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001454 PyErr_NoMemory();
1455 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinner506f5922011-09-28 22:34:18 +02001457 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1458 _PyUnicode_WSTR(unicode), end,
1459 PyUnicode_2BYTE_DATA(unicode));
1460 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1461 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1462 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001463 _PyUnicode_UTF8(unicode) = NULL;
1464 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001465 PyObject_FREE(_PyUnicode_WSTR(unicode));
1466 _PyUnicode_WSTR(unicode) = NULL;
1467 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1468#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 }
1470 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1471 else {
1472#if SIZEOF_WCHAR_T == 2
1473 /* in case the native representation is 2-bytes, we need to allocate a
1474 new normalized 4-byte version. */
1475 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001476 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1477 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 PyErr_NoMemory();
1479 return -1;
1480 }
1481 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1482 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001483 _PyUnicode_UTF8(unicode) = NULL;
1484 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001485 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1486 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001487 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 PyObject_FREE(_PyUnicode_WSTR(unicode));
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1491#else
1492 assert(num_surrogates == 0);
1493
Victor Stinnerc3c74152011-10-02 20:39:55 +02001494 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001496 _PyUnicode_UTF8(unicode) = NULL;
1497 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1499#endif
1500 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1501 }
1502 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001503 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return 0;
1505}
1506
Alexander Belopolsky40018472011-02-26 01:02:56 +00001507static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001508unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509{
Walter Dörwald16807132007-05-25 13:52:07 +00001510 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001511 case SSTATE_NOT_INTERNED:
1512 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001513
Benjamin Peterson29060642009-01-31 22:14:21 +00001514 case SSTATE_INTERNED_MORTAL:
1515 /* revive dead object temporarily for DelItem */
1516 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001517 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 Py_FatalError(
1519 "deletion of interned string failed");
1520 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 case SSTATE_INTERNED_IMMORTAL:
1523 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001524
Benjamin Peterson29060642009-01-31 22:14:21 +00001525 default:
1526 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001527 }
1528
Victor Stinner03490912011-10-03 23:45:12 +02001529 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001531 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001532 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001533 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1534 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001536 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537}
1538
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached one-character strings stored in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that is not in the legacy
       wchar representation can be a cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1555
Alexander Belopolsky40018472011-02-26 01:02:56 +00001556static int
Victor Stinner488fa492011-12-12 00:01:39 +01001557unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001558{
Victor Stinner488fa492011-12-12 00:01:39 +01001559 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001560 if (Py_REFCNT(unicode) != 1)
1561 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001562 if (_PyUnicode_HASH(unicode) != -1)
1563 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001564 if (PyUnicode_CHECK_INTERNED(unicode))
1565 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001566 if (!PyUnicode_CheckExact(unicode))
1567 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001568#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001569 /* singleton refcount is greater than 1 */
1570 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001571#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 return 1;
1573}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001574
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575static int
1576unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1577{
1578 PyObject *unicode;
1579 Py_ssize_t old_length;
1580
1581 assert(p_unicode != NULL);
1582 unicode = *p_unicode;
1583
1584 assert(unicode != NULL);
1585 assert(PyUnicode_Check(unicode));
1586 assert(0 <= length);
1587
Victor Stinner910337b2011-10-03 03:20:16 +02001588 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 old_length = PyUnicode_WSTR_LENGTH(unicode);
1590 else
1591 old_length = PyUnicode_GET_LENGTH(unicode);
1592 if (old_length == length)
1593 return 0;
1594
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001595 if (length == 0) {
1596 Py_DECREF(*p_unicode);
1597 *p_unicode = unicode_empty;
1598 Py_INCREF(*p_unicode);
1599 return 0;
1600 }
1601
Victor Stinner488fa492011-12-12 00:01:39 +01001602 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 PyObject *copy = resize_copy(unicode, length);
1604 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 Py_DECREF(*p_unicode);
1607 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001608 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001609 }
1610
Victor Stinnerfe226c02011-10-03 03:52:20 +02001611 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001612 PyObject *new_unicode = resize_compact(unicode, length);
1613 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001614 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001615 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001617 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001618 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001619}
1620
Alexander Belopolsky40018472011-02-26 01:02:56 +00001621int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001623{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 PyObject *unicode;
1625 if (p_unicode == NULL) {
1626 PyErr_BadInternalCall();
1627 return -1;
1628 }
1629 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001630 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 {
1632 PyErr_BadInternalCall();
1633 return -1;
1634 }
1635 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001636}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001637
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001638static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001639unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1640 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001641{
1642 PyObject *result;
1643 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001644 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001645 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1646 return 0;
1647 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1648 maxchar);
1649 if (result == NULL)
1650 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001651 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652 Py_DECREF(*p_unicode);
1653 *p_unicode = result;
1654 return 0;
1655}
1656
1657static int
1658unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1659 Py_UCS4 ch)
1660{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001661 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001662 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 return -1;
1664 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1665 PyUnicode_DATA(*p_unicode),
1666 (*pos)++, ch);
1667 return 0;
1668}
1669
Victor Stinnerc5166102012-02-22 13:55:02 +01001670/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1671 Return the length of the input string.
1672
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001673 WARNING: The function doesn't copy the terminating null character and
1674 doesn't check the maximum character (may write a latin1 character in an
1675 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001676static Py_ssize_t
1677unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1678{
1679 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1680 void *data = PyUnicode_DATA(unicode);
1681
1682 switch (kind) {
1683 case PyUnicode_1BYTE_KIND: {
1684 Py_ssize_t len = strlen(str);
1685 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001686 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001687 return len;
1688 }
1689 case PyUnicode_2BYTE_KIND: {
1690 Py_UCS2 *start = (Py_UCS2 *)data + index;
1691 Py_UCS2 *ucs2 = start;
1692 assert(index <= PyUnicode_GET_LENGTH(unicode));
1693
1694 for (; *str; ++ucs2, ++str)
1695 *ucs2 = (Py_UCS2)*str;
1696
1697 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1698 return ucs2 - start;
1699 }
1700 default: {
1701 Py_UCS4 *start = (Py_UCS4 *)data + index;
1702 Py_UCS4 *ucs4 = start;
1703 assert(kind == PyUnicode_4BYTE_KIND);
1704 assert(index <= PyUnicode_GET_LENGTH(unicode));
1705
1706 for (; *str; ++ucs4, ++str)
1707 *ucs4 = (Py_UCS4)*str;
1708
1709 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1710 return ucs4 - start;
1711 }
1712 }
1713}
1714
1715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716static PyObject*
1717get_latin1_char(unsigned char ch)
1718{
Victor Stinnera464fc12011-10-02 20:39:30 +02001719 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001721 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722 if (!unicode)
1723 return NULL;
1724 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001725 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 unicode_latin1[ch] = unicode;
1727 }
1728 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001729 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001735 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 Py_UCS4 maxchar = 0;
1737 Py_ssize_t num_surrogates;
1738
1739 if (u == NULL)
1740 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001742 /* If the Unicode data is known at construction time, we can apply
1743 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 /* Optimization for empty strings */
1746 if (size == 0 && unicode_empty != NULL) {
1747 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001748 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001749 }
Tim Petersced69f82003-09-16 20:30:58 +00001750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 /* Single character Unicode objects in the Latin-1 range are
1752 shared when using this constructor */
1753 if (size == 1 && *u < 256)
1754 return get_latin1_char((unsigned char)*u);
1755
1756 /* If not empty and not single character, copy the Unicode data
1757 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001758 if (find_maxchar_surrogates(u, u + size,
1759 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return NULL;
1761
Victor Stinner8faf8212011-12-08 22:14:11 +01001762 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 if (!unicode)
1764 return NULL;
1765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 switch (PyUnicode_KIND(unicode)) {
1767 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001768 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1770 break;
1771 case PyUnicode_2BYTE_KIND:
1772#if Py_UNICODE_SIZE == 2
1773 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1774#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001775 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1777#endif
1778 break;
1779 case PyUnicode_4BYTE_KIND:
1780#if SIZEOF_WCHAR_T == 2
1781 /* This is the only case which has to process surrogates, thus
1782 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001783 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784#else
1785 assert(num_surrogates == 0);
1786 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1787#endif
1788 break;
1789 default:
1790 assert(0 && "Impossible state");
1791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001793 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001798{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001799 if (size < 0) {
1800 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001802 return NULL;
1803 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001804 if (u != NULL)
1805 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1806 else
1807 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001808}
1809
Alexander Belopolsky40018472011-02-26 01:02:56 +00001810PyObject *
1811PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001812{
1813 size_t size = strlen(u);
1814 if (size > PY_SSIZE_T_MAX) {
1815 PyErr_SetString(PyExc_OverflowError, "input too long");
1816 return NULL;
1817 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001818 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001819}
1820
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001821PyObject *
1822_PyUnicode_FromId(_Py_Identifier *id)
1823{
1824 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001825 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1826 strlen(id->string),
1827 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001828 if (!id->object)
1829 return NULL;
1830 PyUnicode_InternInPlace(&id->object);
1831 assert(!id->next);
1832 id->next = static_strings;
1833 static_strings = id;
1834 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001835 return id->object;
1836}
1837
1838void
1839_PyUnicode_ClearStaticStrings()
1840{
1841 _Py_Identifier *i;
1842 for (i = static_strings; i; i = i->next) {
1843 Py_DECREF(i->object);
1844 i->object = NULL;
1845 i->next = NULL;
1846 }
1847}
1848
Benjamin Peterson0df54292012-03-26 14:50:32 -04001849/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001850
Victor Stinnerd3f08822012-05-29 12:57:52 +02001851PyObject*
1852_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001853{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001854 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001855 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001856 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001857#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001858 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001859#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001860 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001861 }
Victor Stinner785938e2011-12-11 20:09:03 +01001862 unicode = PyUnicode_New(size, 127);
1863 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001864 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001865 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1866 assert(_PyUnicode_CheckConsistency(unicode, 1));
1867 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001868}
1869
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001870static Py_UCS4
1871kind_maxchar_limit(unsigned int kind)
1872{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001873 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001874 case PyUnicode_1BYTE_KIND:
1875 return 0x80;
1876 case PyUnicode_2BYTE_KIND:
1877 return 0x100;
1878 case PyUnicode_4BYTE_KIND:
1879 return 0x10000;
1880 default:
1881 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001882 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001883 }
1884}
1885
Victor Stinnere6abb482012-05-02 01:15:40 +02001886Py_LOCAL_INLINE(Py_UCS4)
1887align_maxchar(Py_UCS4 maxchar)
1888{
1889 if (maxchar <= 127)
1890 return 127;
1891 else if (maxchar <= 255)
1892 return 255;
1893 else if (maxchar <= 65535)
1894 return 65535;
1895 else
1896 return MAX_UNICODE;
1897}
1898
Victor Stinner702c7342011-10-05 13:50:52 +02001899static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001900_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001903 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001904
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001905 if (size == 0) {
1906 Py_INCREF(unicode_empty);
1907 return unicode_empty;
1908 }
1909 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001910 if (size == 1)
1911 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001912
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001913 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001914 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 if (!res)
1916 return NULL;
1917 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001920}
1921
Victor Stinnere57b1c02011-09-28 22:20:48 +02001922static PyObject*
1923_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924{
1925 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001926 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001927
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001928 if (size == 0) {
1929 Py_INCREF(unicode_empty);
1930 return unicode_empty;
1931 }
1932 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001933 if (size == 1) {
1934 Py_UCS4 ch = u[0];
1935 if (ch < 256)
1936 return get_latin1_char((unsigned char)ch);
1937
1938 res = PyUnicode_New(1, ch);
1939 if (res == NULL)
1940 return NULL;
1941 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1942 assert(_PyUnicode_CheckConsistency(res, 1));
1943 return res;
1944 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001945
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001946 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001947 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 if (!res)
1949 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001950 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001952 else {
1953 _PyUnicode_CONVERT_BYTES(
1954 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1955 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001956 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 return res;
1958}
1959
Victor Stinnere57b1c02011-09-28 22:20:48 +02001960static PyObject*
1961_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001962{
1963 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001964 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001965
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001966 if (size == 0) {
1967 Py_INCREF(unicode_empty);
1968 return unicode_empty;
1969 }
1970 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001971 if (size == 1) {
1972 Py_UCS4 ch = u[0];
1973 if (ch < 256)
1974 return get_latin1_char((unsigned char)ch);
1975
1976 res = PyUnicode_New(1, ch);
1977 if (res == NULL)
1978 return NULL;
1979 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1980 assert(_PyUnicode_CheckConsistency(res, 1));
1981 return res;
1982 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001983
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001984 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001985 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 if (!res)
1987 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001988 if (max_char < 256)
1989 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1990 PyUnicode_1BYTE_DATA(res));
1991 else if (max_char < 0x10000)
1992 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1993 PyUnicode_2BYTE_DATA(res));
1994 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001996 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 return res;
1998}
1999
2000PyObject*
2001PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2002{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002003 if (size < 0) {
2004 PyErr_SetString(PyExc_ValueError, "size must be positive");
2005 return NULL;
2006 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002007 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002009 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002011 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002014 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015 PyErr_SetString(PyExc_SystemError, "invalid kind");
2016 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018}
2019
Victor Stinnerece58de2012-04-23 23:36:38 +02002020Py_UCS4
2021_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2022{
2023 enum PyUnicode_Kind kind;
2024 void *startptr, *endptr;
2025
2026 assert(PyUnicode_IS_READY(unicode));
2027 assert(0 <= start);
2028 assert(end <= PyUnicode_GET_LENGTH(unicode));
2029 assert(start <= end);
2030
2031 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2032 return PyUnicode_MAX_CHAR_VALUE(unicode);
2033
2034 if (start == end)
2035 return 127;
2036
Victor Stinner94d558b2012-04-27 22:26:58 +02002037 if (PyUnicode_IS_ASCII(unicode))
2038 return 127;
2039
Victor Stinnerece58de2012-04-23 23:36:38 +02002040 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002041 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002042 endptr = (char *)startptr + end * kind;
2043 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002044 switch(kind) {
2045 case PyUnicode_1BYTE_KIND:
2046 return ucs1lib_find_max_char(startptr, endptr);
2047 case PyUnicode_2BYTE_KIND:
2048 return ucs2lib_find_max_char(startptr, endptr);
2049 case PyUnicode_4BYTE_KIND:
2050 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002051 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002052 assert(0);
2053 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002054 }
2055}
2056
Victor Stinner25a4b292011-10-06 12:31:55 +02002057/* Ensure that a string uses the most efficient storage, if it is not the
2058 case: create a new string with of the right kind. Write NULL into *p_unicode
2059 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002060static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002061unicode_adjust_maxchar(PyObject **p_unicode)
2062{
2063 PyObject *unicode, *copy;
2064 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002065 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002066 unsigned int kind;
2067
2068 assert(p_unicode != NULL);
2069 unicode = *p_unicode;
2070 assert(PyUnicode_IS_READY(unicode));
2071 if (PyUnicode_IS_ASCII(unicode))
2072 return;
2073
2074 len = PyUnicode_GET_LENGTH(unicode);
2075 kind = PyUnicode_KIND(unicode);
2076 if (kind == PyUnicode_1BYTE_KIND) {
2077 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002078 max_char = ucs1lib_find_max_char(u, u + len);
2079 if (max_char >= 128)
2080 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002081 }
2082 else if (kind == PyUnicode_2BYTE_KIND) {
2083 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002084 max_char = ucs2lib_find_max_char(u, u + len);
2085 if (max_char >= 256)
2086 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002087 }
2088 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002089 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002091 max_char = ucs4lib_find_max_char(u, u + len);
2092 if (max_char >= 0x10000)
2093 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002095 copy = PyUnicode_New(len, max_char);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002096 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 Py_DECREF(unicode);
2098 *p_unicode = copy;
2099}
2100
Victor Stinner034f6cf2011-09-30 02:26:44 +02002101PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002102_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002103{
Victor Stinner87af4f22011-11-21 23:03:47 +01002104 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002105 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002106
Victor Stinner034f6cf2011-09-30 02:26:44 +02002107 if (!PyUnicode_Check(unicode)) {
2108 PyErr_BadInternalCall();
2109 return NULL;
2110 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002111 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002113
Victor Stinner87af4f22011-11-21 23:03:47 +01002114 length = PyUnicode_GET_LENGTH(unicode);
2115 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002116 if (!copy)
2117 return NULL;
2118 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2119
Victor Stinner87af4f22011-11-21 23:03:47 +01002120 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2121 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002122 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002123 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002124}
2125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126
Victor Stinnerbc603d12011-10-02 01:00:40 +02002127/* Widen Unicode objects to larger buffers. Don't write terminating null
2128 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129
2130void*
2131_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2132{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002133 Py_ssize_t len;
2134 void *result;
2135 unsigned int skind;
2136
Benjamin Petersonbac79492012-01-14 13:34:47 -05002137 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 return NULL;
2139
2140 len = PyUnicode_GET_LENGTH(s);
2141 skind = PyUnicode_KIND(s);
2142 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002143 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 return NULL;
2145 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002146 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 case PyUnicode_2BYTE_KIND:
2148 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2149 if (!result)
2150 return PyErr_NoMemory();
2151 assert(skind == PyUnicode_1BYTE_KIND);
2152 _PyUnicode_CONVERT_BYTES(
2153 Py_UCS1, Py_UCS2,
2154 PyUnicode_1BYTE_DATA(s),
2155 PyUnicode_1BYTE_DATA(s) + len,
2156 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 case PyUnicode_4BYTE_KIND:
2159 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2160 if (!result)
2161 return PyErr_NoMemory();
2162 if (skind == PyUnicode_2BYTE_KIND) {
2163 _PyUnicode_CONVERT_BYTES(
2164 Py_UCS2, Py_UCS4,
2165 PyUnicode_2BYTE_DATA(s),
2166 PyUnicode_2BYTE_DATA(s) + len,
2167 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 else {
2170 assert(skind == PyUnicode_1BYTE_KIND);
2171 _PyUnicode_CONVERT_BYTES(
2172 Py_UCS1, Py_UCS4,
2173 PyUnicode_1BYTE_DATA(s),
2174 PyUnicode_1BYTE_DATA(s) + len,
2175 result);
2176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 default:
2179 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 }
Victor Stinner01698042011-10-04 00:04:26 +02002181 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return NULL;
2183}
2184
2185static Py_UCS4*
2186as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2187 int copy_null)
2188{
2189 int kind;
2190 void *data;
2191 Py_ssize_t len, targetlen;
2192 if (PyUnicode_READY(string) == -1)
2193 return NULL;
2194 kind = PyUnicode_KIND(string);
2195 data = PyUnicode_DATA(string);
2196 len = PyUnicode_GET_LENGTH(string);
2197 targetlen = len;
2198 if (copy_null)
2199 targetlen++;
2200 if (!target) {
2201 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2202 PyErr_NoMemory();
2203 return NULL;
2204 }
2205 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2206 if (!target) {
2207 PyErr_NoMemory();
2208 return NULL;
2209 }
2210 }
2211 else {
2212 if (targetsize < targetlen) {
2213 PyErr_Format(PyExc_SystemError,
2214 "string is longer than the buffer");
2215 if (copy_null && 0 < targetsize)
2216 target[0] = 0;
2217 return NULL;
2218 }
2219 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002220 if (kind == PyUnicode_1BYTE_KIND) {
2221 Py_UCS1 *start = (Py_UCS1 *) data;
2222 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002224 else if (kind == PyUnicode_2BYTE_KIND) {
2225 Py_UCS2 *start = (Py_UCS2 *) data;
2226 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2227 }
2228 else {
2229 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 if (copy_null)
2233 target[len] = 0;
2234 return target;
2235}
2236
2237Py_UCS4*
2238PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2239 int copy_null)
2240{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002241 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 PyErr_BadInternalCall();
2243 return NULL;
2244 }
2245 return as_ucs4(string, target, targetsize, copy_null);
2246}
2247
2248Py_UCS4*
2249PyUnicode_AsUCS4Copy(PyObject *string)
2250{
2251 return as_ucs4(string, NULL, 0, 1);
2252}
2253
2254#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002255
Alexander Belopolsky40018472011-02-26 01:02:56 +00002256PyObject *
2257PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002260 if (size == 0) {
2261 Py_INCREF(unicode_empty);
2262 return unicode_empty;
2263 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002264 PyErr_BadInternalCall();
2265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266 }
2267
Martin v. Löwis790465f2008-04-05 20:41:37 +00002268 if (size == -1) {
2269 size = wcslen(w);
2270 }
2271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273}
2274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002276
Walter Dörwald346737f2007-05-31 10:44:43 +00002277static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2279 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002280{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 *fmt++ = '%';
2282 if (width) {
2283 if (zeropad)
2284 *fmt++ = '0';
2285 fmt += sprintf(fmt, "%d", width);
2286 }
2287 if (precision)
2288 fmt += sprintf(fmt, ".%d", precision);
2289 if (longflag)
2290 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002291 else if (longlongflag) {
2292 /* longlongflag should only ever be nonzero on machines with
2293 HAVE_LONG_LONG defined */
2294#ifdef HAVE_LONG_LONG
2295 char *f = PY_FORMAT_LONG_LONG;
2296 while (*f)
2297 *fmt++ = *f++;
2298#else
2299 /* we shouldn't ever get here */
2300 assert(0);
2301 *fmt++ = 'l';
2302#endif
2303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002304 else if (size_tflag) {
2305 char *f = PY_FORMAT_SIZE_T;
2306 while (*f)
2307 *fmt++ = *f++;
2308 }
2309 *fmt++ = c;
2310 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002311}
2312
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%' pointed to by `f`.

   The decimal width, the optional ".precision" and the length modifier
   ("l", "ll" when HAVE_LONG_LONG is defined, or "z"; each only recognized
   when directly followed by 'd', 'u' or 'i') are parsed.  Each result is
   stored through the matching p_* pointer unless that pointer is NULL;
   values that are absent from the format are reported as 0.

   Return a pointer to the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => step back so that f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format such as "%.1" => step back, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi.
       On a match, f is advanced to the conversion character. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* the size_t flag */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2377
/* Maximum number of characters required for the output of %ld:
   21 characters are enough for a 64-bit integer in decimal plus an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld:
   at most ceil(log10(256) * SIZEOF_LONG_LONG) digits, plus 1 for the sign.
   53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
2385
Walter Dörwaldd2034312007-05-18 16:29:38 +00002386PyObject *
2387PyUnicode_FromFormatV(const char *format, va_list vargs)
2388{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 va_list count;
2390 Py_ssize_t callcount = 0;
2391 PyObject **callresults = NULL;
2392 PyObject **callresult = NULL;
2393 Py_ssize_t n = 0;
2394 int width = 0;
2395 int precision = 0;
2396 int zeropad;
2397 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002398 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002399 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002400 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002401 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2402 Py_UCS4 argmaxchar;
2403 Py_ssize_t numbersize = 0;
2404 char *numberresults = NULL;
2405 char *numberresult = NULL;
2406 Py_ssize_t i;
2407 int kind;
2408 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002409
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002410 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002411 /* step 1: count the number of %S/%R/%A/%s format specifications
2412 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2413 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002415 * also estimate a upper bound for all the number formats in the string,
2416 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002418 for (f = format; *f; f++) {
2419 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002420 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2422 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2423 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2424 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002427#ifdef HAVE_LONG_LONG
2428 if (longlongflag) {
2429 if (width < MAX_LONG_LONG_CHARS)
2430 width = MAX_LONG_LONG_CHARS;
2431 }
2432 else
2433#endif
2434 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2435 including sign. Decimal takes the most space. This
2436 isn't enough for octal. If a width is specified we
2437 need more (which we allocate later). */
2438 if (width < MAX_LONG_CHARS)
2439 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440
2441 /* account for the size + '\0' to separate numbers
2442 inside of the numberresults buffer */
2443 numbersize += (width + 1);
2444 }
2445 }
2446 else if ((unsigned char)*f > 127) {
2447 PyErr_Format(PyExc_ValueError,
2448 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2449 "string, got a non-ASCII byte: 0x%02x",
2450 (unsigned char)*f);
2451 return NULL;
2452 }
2453 }
2454 /* step 2: allocate memory for the results of
2455 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2456 if (callcount) {
2457 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2458 if (!callresults) {
2459 PyErr_NoMemory();
2460 return NULL;
2461 }
2462 callresult = callresults;
2463 }
2464 /* step 2.5: allocate memory for the results of formating numbers */
2465 if (numbersize) {
2466 numberresults = PyObject_Malloc(numbersize);
2467 if (!numberresults) {
2468 PyErr_NoMemory();
2469 goto fail;
2470 }
2471 numberresult = numberresults;
2472 }
2473
2474 /* step 3: format numbers and figure out how large a buffer we need */
2475 for (f = format; *f; f++) {
2476 if (*f == '%') {
2477 const char* p;
2478 int longflag;
2479 int longlongflag;
2480 int size_tflag;
2481 int numprinted;
2482
2483 p = f;
2484 zeropad = (f[1] == '0');
2485 f = parse_format_flags(f, &width, &precision,
2486 &longflag, &longlongflag, &size_tflag);
2487 switch (*f) {
2488 case 'c':
2489 {
2490 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002491 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 n++;
2493 break;
2494 }
2495 case '%':
2496 n++;
2497 break;
2498 case 'i':
2499 case 'd':
2500 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2501 width, precision, *f);
2502 if (longflag)
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, long));
2505#ifdef HAVE_LONG_LONG
2506 else if (longlongflag)
2507 numprinted = sprintf(numberresult, fmt,
2508 va_arg(count, PY_LONG_LONG));
2509#endif
2510 else if (size_tflag)
2511 numprinted = sprintf(numberresult, fmt,
2512 va_arg(count, Py_ssize_t));
2513 else
2514 numprinted = sprintf(numberresult, fmt,
2515 va_arg(count, int));
2516 n += numprinted;
2517 /* advance by +1 to skip over the '\0' */
2518 numberresult += (numprinted + 1);
2519 assert(*(numberresult - 1) == '\0');
2520 assert(*(numberresult - 2) != '\0');
2521 assert(numprinted >= 0);
2522 assert(numberresult <= numberresults + numbersize);
2523 break;
2524 case 'u':
2525 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2526 width, precision, 'u');
2527 if (longflag)
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, unsigned long));
2530#ifdef HAVE_LONG_LONG
2531 else if (longlongflag)
2532 numprinted = sprintf(numberresult, fmt,
2533 va_arg(count, unsigned PY_LONG_LONG));
2534#endif
2535 else if (size_tflag)
2536 numprinted = sprintf(numberresult, fmt,
2537 va_arg(count, size_t));
2538 else
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, unsigned int));
2541 n += numprinted;
2542 numberresult += (numprinted + 1);
2543 assert(*(numberresult - 1) == '\0');
2544 assert(*(numberresult - 2) != '\0');
2545 assert(numprinted >= 0);
2546 assert(numberresult <= numberresults + numbersize);
2547 break;
2548 case 'x':
2549 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2550 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2551 n += numprinted;
2552 numberresult += (numprinted + 1);
2553 assert(*(numberresult - 1) == '\0');
2554 assert(*(numberresult - 2) != '\0');
2555 assert(numprinted >= 0);
2556 assert(numberresult <= numberresults + numbersize);
2557 break;
2558 case 'p':
2559 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2560 /* %p is ill-defined: ensure leading 0x. */
2561 if (numberresult[1] == 'X')
2562 numberresult[1] = 'x';
2563 else if (numberresult[1] != 'x') {
2564 memmove(numberresult + 2, numberresult,
2565 strlen(numberresult) + 1);
2566 numberresult[0] = '0';
2567 numberresult[1] = 'x';
2568 numprinted += 2;
2569 }
2570 n += numprinted;
2571 numberresult += (numprinted + 1);
2572 assert(*(numberresult - 1) == '\0');
2573 assert(*(numberresult - 2) != '\0');
2574 assert(numprinted >= 0);
2575 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 break;
2577 case 's':
2578 {
2579 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002580 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002581 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002582 if (!str)
2583 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 /* since PyUnicode_DecodeUTF8 returns already flexible
2585 unicode objects, there is no need to call ready on them */
2586 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002587 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002589 /* Remember the str and switch to the next slot */
2590 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 }
2593 case 'U':
2594 {
2595 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002596 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 if (PyUnicode_READY(obj) == -1)
2598 goto fail;
2599 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002600 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 break;
2603 }
2604 case 'V':
2605 {
2606 PyObject *obj = va_arg(count, PyObject *);
2607 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002608 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002610 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002611 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 if (PyUnicode_READY(obj) == -1)
2613 goto fail;
2614 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002615 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002617 *callresult++ = NULL;
2618 }
2619 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002620 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 if (!str_obj)
2622 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002623 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002624 Py_DECREF(str_obj);
2625 goto fail;
2626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002628 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002630 *callresult++ = str_obj;
2631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 break;
2633 }
2634 case 'S':
2635 {
2636 PyObject *obj = va_arg(count, PyObject *);
2637 PyObject *str;
2638 assert(obj);
2639 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002640 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002642 if (PyUnicode_READY(str) == -1) {
2643 Py_DECREF(str);
2644 goto fail;
2645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002647 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 /* Remember the str and switch to the next slot */
2650 *callresult++ = str;
2651 break;
2652 }
2653 case 'R':
2654 {
2655 PyObject *obj = va_arg(count, PyObject *);
2656 PyObject *repr;
2657 assert(obj);
2658 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002659 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002661 if (PyUnicode_READY(repr) == -1) {
2662 Py_DECREF(repr);
2663 goto fail;
2664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002666 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 /* Remember the repr and switch to the next slot */
2669 *callresult++ = repr;
2670 break;
2671 }
2672 case 'A':
2673 {
2674 PyObject *obj = va_arg(count, PyObject *);
2675 PyObject *ascii;
2676 assert(obj);
2677 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002678 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002680 if (PyUnicode_READY(ascii) == -1) {
2681 Py_DECREF(ascii);
2682 goto fail;
2683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002685 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 /* Remember the repr and switch to the next slot */
2688 *callresult++ = ascii;
2689 break;
2690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002691 default:
2692 /* if we stumble upon an unknown
2693 formatting code, copy the rest of
2694 the format string to the output
2695 string. (we cannot just skip the
2696 code, since there's no way to know
2697 what's in the argument list) */
2698 n += strlen(p);
2699 goto expand;
2700 }
2701 } else
2702 n++;
2703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002704 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 we don't have to resize the string.
2708 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002709 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 if (!string)
2711 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 kind = PyUnicode_KIND(string);
2713 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002717 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002719 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002720
2721 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2723 /* checking for == because the last argument could be a empty
2724 string, which causes i to point to end, the assert at the end of
2725 the loop */
2726 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002727
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 switch (*f) {
2729 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002730 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 const int ordinal = va_arg(vargs, int);
2732 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002734 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002735 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002738 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002740 {
2741 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 /* unused, since we already have the result */
2743 if (*f == 'p')
2744 (void) va_arg(vargs, void *);
2745 else
2746 (void) va_arg(vargs, int);
2747 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002748 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002750 i += written;
2751 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 assert(*numberresult == '\0');
2753 numberresult++;
2754 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002755 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002756 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002757 case 's':
2758 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002759 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002761 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002762 size = PyUnicode_GET_LENGTH(*callresult);
2763 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002764 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002766 /* We're done with the unicode()/repr() => forget it */
2767 Py_DECREF(*callresult);
2768 /* switch to next unicode()/repr() result */
2769 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 break;
2771 }
2772 case 'U':
2773 {
2774 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 Py_ssize_t size;
2776 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2777 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002778 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 break;
2781 }
2782 case 'V':
2783 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002785 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002786 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 size = PyUnicode_GET_LENGTH(obj);
2789 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002790 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002792 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 size = PyUnicode_GET_LENGTH(*callresult);
2794 assert(PyUnicode_KIND(*callresult) <=
2795 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002796 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002798 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002800 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002801 break;
2802 }
2803 case 'S':
2804 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002805 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002807 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002808 /* unused, since we already have the result */
2809 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002811 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002812 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 /* We're done with the unicode()/repr() => forget it */
2814 Py_DECREF(*callresult);
2815 /* switch to next unicode()/repr() result */
2816 ++callresult;
2817 break;
2818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002819 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002821 break;
2822 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002823 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002825 goto end;
2826 }
Victor Stinner1205f272010-09-11 00:54:47 +00002827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 else {
2829 assert(i < PyUnicode_GET_LENGTH(string));
2830 PyUnicode_WRITE(kind, data, i++, *f);
2831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002834
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002836 if (callresults)
2837 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002838 if (numberresults)
2839 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002840 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002842 if (callresults) {
2843 PyObject **callresult2 = callresults;
2844 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002845 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002846 ++callresult2;
2847 }
2848 PyObject_Free(callresults);
2849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 if (numberresults)
2851 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002852 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002853}
2854
Walter Dörwaldd2034312007-05-18 16:29:38 +00002855PyObject *
2856PyUnicode_FromFormat(const char *format, ...)
2857{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 PyObject* ret;
2859 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002860
2861#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002862 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002863#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002866 ret = PyUnicode_FromFormatV(format, vargs);
2867 va_end(vargs);
2868 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002869}
2870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002871#ifdef HAVE_WCHAR_H
2872
Victor Stinner5593d8a2010-10-02 11:11:27 +00002873/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2874 convert a Unicode object to a wide character string.
2875
Victor Stinnerd88d9832011-09-06 02:00:05 +02002876 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002877 character) required to convert the unicode object. Ignore size argument.
2878
Victor Stinnerd88d9832011-09-06 02:00:05 +02002879 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002880 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002881 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002882static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002883unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002884 wchar_t *w,
2885 Py_ssize_t size)
2886{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002887 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002888 const wchar_t *wstr;
2889
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002890 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002891 if (wstr == NULL)
2892 return -1;
2893
Victor Stinner5593d8a2010-10-02 11:11:27 +00002894 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002895 if (size > res)
2896 size = res + 1;
2897 else
2898 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002900 return res;
2901 }
2902 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002904}
2905
2906Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002907PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002908 wchar_t *w,
2909 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910{
2911 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002912 PyErr_BadInternalCall();
2913 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002915 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916}
2917
Victor Stinner137c34c2010-09-29 10:25:54 +00002918wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002919PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002920 Py_ssize_t *size)
2921{
2922 wchar_t* buffer;
2923 Py_ssize_t buflen;
2924
2925 if (unicode == NULL) {
2926 PyErr_BadInternalCall();
2927 return NULL;
2928 }
2929
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002930 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002931 if (buflen == -1)
2932 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002933 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002934 PyErr_NoMemory();
2935 return NULL;
2936 }
2937
Victor Stinner137c34c2010-09-29 10:25:54 +00002938 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2939 if (buffer == NULL) {
2940 PyErr_NoMemory();
2941 return NULL;
2942 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002943 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944 if (buflen == -1)
2945 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946 if (size != NULL)
2947 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002948 return buffer;
2949}
2950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002951#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952
Alexander Belopolsky40018472011-02-26 01:02:56 +00002953PyObject *
2954PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002957 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 PyErr_SetString(PyExc_ValueError,
2959 "chr() arg not in range(0x110000)");
2960 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002961 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (ordinal < 256)
2964 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 v = PyUnicode_New(1, ordinal);
2967 if (v == NULL)
2968 return NULL;
2969 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002970 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002972}
2973
Alexander Belopolsky40018472011-02-26 01:02:56 +00002974PyObject *
2975PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002977 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002979 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002980 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002981 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002982 Py_INCREF(obj);
2983 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002984 }
2985 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002986 /* For a Unicode subtype that's not a Unicode object,
2987 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002988 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002989 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002990 PyErr_Format(PyExc_TypeError,
2991 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002992 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002993 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002994}
2995
Alexander Belopolsky40018472011-02-26 01:02:56 +00002996PyObject *
2997PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002998 const char *encoding,
2999 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003000{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003001 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003002 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003003
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 PyErr_BadInternalCall();
3006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003008
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003009 /* Decoding bytes objects is the most common case and should be fast */
3010 if (PyBytes_Check(obj)) {
3011 if (PyBytes_GET_SIZE(obj) == 0) {
3012 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003013 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003014 }
3015 else {
3016 v = PyUnicode_Decode(
3017 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3018 encoding, errors);
3019 }
3020 return v;
3021 }
3022
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003023 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003024 PyErr_SetString(PyExc_TypeError,
3025 "decoding str is not supported");
3026 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003027 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003028
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003029 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3030 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3031 PyErr_Format(PyExc_TypeError,
3032 "coercing to str: need bytes, bytearray "
3033 "or buffer-like object, %.80s found",
3034 Py_TYPE(obj)->tp_name);
3035 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003036 }
Tim Petersced69f82003-09-16 20:30:58 +00003037
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003038 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003040 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 }
Tim Petersced69f82003-09-16 20:30:58 +00003042 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003043 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003044
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003045 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047}
3048
/* Write a normalized copy of `encoding` into `lower` (capacity
   `lower_len` bytes): upper case letters become lower case and '_'
   becomes '-', so that e.g. "UTF_8" is recognized as "utf-8".

   A NULL `encoding` is treated as "utf-8".

   Return 1 on success, 0 if the name does not fit (encoding is longer
   than lower_len-1).  On a 0 return `lower` is not null-terminated and
   must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *last;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    /* `last` is the slot reserved for the terminating null byte. */
    last = &lower[lower_len - 1];
    for (src = encoding, dst = lower; *src; src++, dst++) {
        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
3085
Alexander Belopolsky40018472011-02-26 01:02:56 +00003086PyObject *
3087PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003088 Py_ssize_t size,
3089 const char *encoding,
3090 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003091{
3092 PyObject *buffer = NULL, *unicode;
3093 Py_buffer info;
3094 char lower[11]; /* Enough for any encoding shortcut */
3095
Fred Drakee4315f52000-05-09 19:53:39 +00003096 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003097 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003098 if ((strcmp(lower, "utf-8") == 0) ||
3099 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003100 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003101 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003102 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003103 (strcmp(lower, "iso-8859-1") == 0))
3104 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003105#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003106 else if (strcmp(lower, "mbcs") == 0)
3107 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003108#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003109 else if (strcmp(lower, "ascii") == 0)
3110 return PyUnicode_DecodeASCII(s, size, errors);
3111 else if (strcmp(lower, "utf-16") == 0)
3112 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3113 else if (strcmp(lower, "utf-32") == 0)
3114 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3115 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116
3117 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003118 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003119 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003120 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003121 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 if (buffer == NULL)
3123 goto onError;
3124 unicode = PyCodec_Decode(buffer, encoding, errors);
3125 if (unicode == NULL)
3126 goto onError;
3127 if (!PyUnicode_Check(unicode)) {
3128 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003129 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003130 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 Py_DECREF(unicode);
3132 goto onError;
3133 }
3134 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003135 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003136
Benjamin Peterson29060642009-01-31 22:14:21 +00003137 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 Py_XDECREF(buffer);
3139 return NULL;
3140}
3141
Alexander Belopolsky40018472011-02-26 01:02:56 +00003142PyObject *
3143PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003144 const char *encoding,
3145 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003146{
3147 PyObject *v;
3148
3149 if (!PyUnicode_Check(unicode)) {
3150 PyErr_BadArgument();
3151 goto onError;
3152 }
3153
3154 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156
3157 /* Decode via the codec registry */
3158 v = PyCodec_Decode(unicode, encoding, errors);
3159 if (v == NULL)
3160 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003161 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003162
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003164 return NULL;
3165}
3166
Alexander Belopolsky40018472011-02-26 01:02:56 +00003167PyObject *
3168PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003169 const char *encoding,
3170 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003171{
3172 PyObject *v;
3173
3174 if (!PyUnicode_Check(unicode)) {
3175 PyErr_BadArgument();
3176 goto onError;
3177 }
3178
3179 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003180 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181
3182 /* Decode via the codec registry */
3183 v = PyCodec_Decode(unicode, encoding, errors);
3184 if (v == NULL)
3185 goto onError;
3186 if (!PyUnicode_Check(v)) {
3187 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003188 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189 Py_TYPE(v)->tp_name);
3190 Py_DECREF(v);
3191 goto onError;
3192 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003193 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003194
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196 return NULL;
3197}
3198
Alexander Belopolsky40018472011-02-26 01:02:56 +00003199PyObject *
3200PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003201 Py_ssize_t size,
3202 const char *encoding,
3203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204{
3205 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003206
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 unicode = PyUnicode_FromUnicode(s, size);
3208 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3211 Py_DECREF(unicode);
3212 return v;
3213}
3214
Alexander Belopolsky40018472011-02-26 01:02:56 +00003215PyObject *
3216PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003217 const char *encoding,
3218 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003219{
3220 PyObject *v;
3221
3222 if (!PyUnicode_Check(unicode)) {
3223 PyErr_BadArgument();
3224 goto onError;
3225 }
3226
3227 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003229
3230 /* Encode via the codec registry */
3231 v = PyCodec_Encode(unicode, encoding, errors);
3232 if (v == NULL)
3233 goto onError;
3234 return v;
3235
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003237 return NULL;
3238}
3239
/* Locate the first wide character of the null-terminated string `wstr`
   that wcstombs() refuses to convert, by converting the string one
   character at a time (one surrogate pair at a time when wchar_t is
   16 bits wide).

   Returns the index, in wchar_t units, of the offending character.
   Returns 0 when every character converts, so a 0 result cannot be told
   apart from "the very first character failed". */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The terminator slot never changes; set it once up front. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *here = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return here - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3286
Victor Stinner1b579672011-12-17 05:47:23 +01003287static int
3288locale_error_handler(const char *errors, int *surrogateescape)
3289{
3290 if (errors == NULL) {
3291 *surrogateescape = 0;
3292 return 0;
3293 }
3294
3295 if (strcmp(errors, "strict") == 0) {
3296 *surrogateescape = 0;
3297 return 0;
3298 }
3299 if (strcmp(errors, "surrogateescape") == 0) {
3300 *surrogateescape = 1;
3301 return 0;
3302 }
3303 PyErr_Format(PyExc_ValueError,
3304 "only 'strict' and 'surrogateescape' error handlers "
3305 "are supported, not '%s'",
3306 errors);
3307 return -1;
3308}
3309
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003310PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003311PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003312{
3313 Py_ssize_t wlen, wlen2;
3314 wchar_t *wstr;
3315 PyObject *bytes = NULL;
3316 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003317 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003318 PyObject *exc;
3319 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003320 int surrogateescape;
3321
3322 if (locale_error_handler(errors, &surrogateescape) < 0)
3323 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003324
3325 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3326 if (wstr == NULL)
3327 return NULL;
3328
3329 wlen2 = wcslen(wstr);
3330 if (wlen2 != wlen) {
3331 PyMem_Free(wstr);
3332 PyErr_SetString(PyExc_TypeError, "embedded null character");
3333 return NULL;
3334 }
3335
3336 if (surrogateescape) {
3337 /* locale encoding with surrogateescape */
3338 char *str;
3339
3340 str = _Py_wchar2char(wstr, &error_pos);
3341 if (str == NULL) {
3342 if (error_pos == (size_t)-1) {
3343 PyErr_NoMemory();
3344 PyMem_Free(wstr);
3345 return NULL;
3346 }
3347 else {
3348 goto encode_error;
3349 }
3350 }
3351 PyMem_Free(wstr);
3352
3353 bytes = PyBytes_FromString(str);
3354 PyMem_Free(str);
3355 }
3356 else {
3357 size_t len, len2;
3358
3359 len = wcstombs(NULL, wstr, 0);
3360 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003361 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003362 goto encode_error;
3363 }
3364
3365 bytes = PyBytes_FromStringAndSize(NULL, len);
3366 if (bytes == NULL) {
3367 PyMem_Free(wstr);
3368 return NULL;
3369 }
3370
3371 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3372 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003373 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003374 goto encode_error;
3375 }
3376 PyMem_Free(wstr);
3377 }
3378 return bytes;
3379
3380encode_error:
3381 errmsg = strerror(errno);
3382 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003383
3384 if (error_pos == (size_t)-1)
3385 error_pos = wcstombs_errorpos(wstr);
3386
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003387 PyMem_Free(wstr);
3388 Py_XDECREF(bytes);
3389
Victor Stinner2f197072011-12-17 07:08:30 +01003390 if (errmsg != NULL) {
3391 size_t errlen;
3392 wstr = _Py_char2wchar(errmsg, &errlen);
3393 if (wstr != NULL) {
3394 reason = PyUnicode_FromWideChar(wstr, errlen);
3395 PyMem_Free(wstr);
3396 } else
3397 errmsg = NULL;
3398 }
3399 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003400 reason = PyUnicode_FromString(
3401 "wcstombs() encountered an unencodable "
3402 "wide character");
3403 if (reason == NULL)
3404 return NULL;
3405
3406 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3407 "locale", unicode,
3408 (Py_ssize_t)error_pos,
3409 (Py_ssize_t)(error_pos+1),
3410 reason);
3411 Py_DECREF(reason);
3412 if (exc != NULL) {
3413 PyCodec_StrictErrors(exc);
3414 Py_XDECREF(exc);
3415 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416 return NULL;
3417}
3418
Victor Stinnerad158722010-10-27 00:25:46 +00003419PyObject *
3420PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003421{
Victor Stinner99b95382011-07-04 14:23:54 +02003422#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003423 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003424#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003425 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003426#else
Victor Stinner793b5312011-04-27 00:24:21 +02003427 PyInterpreterState *interp = PyThreadState_GET()->interp;
3428 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3429 cannot use it to encode and decode filenames before it is loaded. Load
3430 the Python codec requires to encode at least its own filename. Use the C
3431 version of the locale codec until the codec registry is initialized and
3432 the Python codec is loaded.
3433
3434 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3435 cannot only rely on it: check also interp->fscodec_initialized for
3436 subinterpreters. */
3437 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003438 return PyUnicode_AsEncodedString(unicode,
3439 Py_FileSystemDefaultEncoding,
3440 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003441 }
3442 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003443 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003444 }
Victor Stinnerad158722010-10-27 00:25:46 +00003445#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003446}
3447
Alexander Belopolsky40018472011-02-26 01:02:56 +00003448PyObject *
3449PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003450 const char *encoding,
3451 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452{
3453 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003454 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003455
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 if (!PyUnicode_Check(unicode)) {
3457 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 }
Fred Drakee4315f52000-05-09 19:53:39 +00003460
Fred Drakee4315f52000-05-09 19:53:39 +00003461 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003462 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003463 if ((strcmp(lower, "utf-8") == 0) ||
3464 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003465 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003466 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003467 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003468 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003470 }
Victor Stinner37296e82010-06-10 13:36:23 +00003471 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003472 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003473 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003475#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003476 else if (strcmp(lower, "mbcs") == 0)
3477 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003478#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003479 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003480 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482
3483 /* Encode via the codec registry */
3484 v = PyCodec_Encode(unicode, encoding, errors);
3485 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003486 return NULL;
3487
3488 /* The normal path */
3489 if (PyBytes_Check(v))
3490 return v;
3491
3492 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003493 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003494 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003495 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003496
3497 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3498 "encoder %s returned bytearray instead of bytes",
3499 encoding);
3500 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003501 Py_DECREF(v);
3502 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003503 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003505 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3506 Py_DECREF(v);
3507 return b;
3508 }
3509
3510 PyErr_Format(PyExc_TypeError,
3511 "encoder did not return a bytes object (type=%.400s)",
3512 Py_TYPE(v)->tp_name);
3513 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003514 return NULL;
3515}
3516
Alexander Belopolsky40018472011-02-26 01:02:56 +00003517PyObject *
3518PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003519 const char *encoding,
3520 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003521{
3522 PyObject *v;
3523
3524 if (!PyUnicode_Check(unicode)) {
3525 PyErr_BadArgument();
3526 goto onError;
3527 }
3528
3529 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003530 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003531
3532 /* Encode via the codec registry */
3533 v = PyCodec_Encode(unicode, encoding, errors);
3534 if (v == NULL)
3535 goto onError;
3536 if (!PyUnicode_Check(v)) {
3537 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003538 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539 Py_TYPE(v)->tp_name);
3540 Py_DECREF(v);
3541 goto onError;
3542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003544
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 return NULL;
3547}
3548
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert (invalid or incomplete character).

   Returns the byte offset of that sequence.  Returns 0 when nothing
   undecodable is found, and always returns 0 on builds without
   HAVE_MBRTOWC, so a 0 result cannot be told apart from "the very first
   byte failed". */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    /* Start from the initial conversion state. */
    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t used = mbrtowc(&wc, (char*)cursor, remaining, &state);

        if (used == 0) {
            /* Reached end of string */
            break;
        }
        if (used == (size_t)-1 || used == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += used;
        remaining -= used;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3579
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003580PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003581PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003582 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003583{
3584 wchar_t smallbuf[256];
3585 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3586 wchar_t *wstr;
3587 size_t wlen, wlen2;
3588 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003589 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003590 size_t error_pos;
3591 char *errmsg;
3592 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003593
3594 if (locale_error_handler(errors, &surrogateescape) < 0)
3595 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003596
3597 if (str[len] != '\0' || len != strlen(str)) {
3598 PyErr_SetString(PyExc_TypeError, "embedded null character");
3599 return NULL;
3600 }
3601
3602 if (surrogateescape)
3603 {
3604 wstr = _Py_char2wchar(str, &wlen);
3605 if (wstr == NULL) {
3606 if (wlen == (size_t)-1)
3607 PyErr_NoMemory();
3608 else
3609 PyErr_SetFromErrno(PyExc_OSError);
3610 return NULL;
3611 }
3612
3613 unicode = PyUnicode_FromWideChar(wstr, wlen);
3614 PyMem_Free(wstr);
3615 }
3616 else {
3617#ifndef HAVE_BROKEN_MBSTOWCS
3618 wlen = mbstowcs(NULL, str, 0);
3619#else
3620 wlen = len;
3621#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003622 if (wlen == (size_t)-1)
3623 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003624 if (wlen+1 <= smallbuf_len) {
3625 wstr = smallbuf;
3626 }
3627 else {
3628 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3629 return PyErr_NoMemory();
3630
3631 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3632 if (!wstr)
3633 return PyErr_NoMemory();
3634 }
3635
3636 /* This shouldn't fail now */
3637 wlen2 = mbstowcs(wstr, str, wlen+1);
3638 if (wlen2 == (size_t)-1) {
3639 if (wstr != smallbuf)
3640 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003641 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003642 }
3643#ifdef HAVE_BROKEN_MBSTOWCS
3644 assert(wlen2 == wlen);
3645#endif
3646 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3647 if (wstr != smallbuf)
3648 PyMem_Free(wstr);
3649 }
3650 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003651
3652decode_error:
3653 errmsg = strerror(errno);
3654 assert(errmsg != NULL);
3655
3656 error_pos = mbstowcs_errorpos(str, len);
3657 if (errmsg != NULL) {
3658 size_t errlen;
3659 wstr = _Py_char2wchar(errmsg, &errlen);
3660 if (wstr != NULL) {
3661 reason = PyUnicode_FromWideChar(wstr, errlen);
3662 PyMem_Free(wstr);
3663 } else
3664 errmsg = NULL;
3665 }
3666 if (errmsg == NULL)
3667 reason = PyUnicode_FromString(
3668 "mbstowcs() encountered an invalid multibyte sequence");
3669 if (reason == NULL)
3670 return NULL;
3671
3672 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3673 "locale", str, len,
3674 (Py_ssize_t)error_pos,
3675 (Py_ssize_t)(error_pos+1),
3676 reason);
3677 Py_DECREF(reason);
3678 if (exc != NULL) {
3679 PyCodec_StrictErrors(exc);
3680 Py_XDECREF(exc);
3681 }
3682 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003683}
3684
3685PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003686PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003687{
3688 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003689 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003690}
3691
3692
3693PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003694PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003695 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003696 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3697}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003698
Christian Heimes5894ba72007-11-04 11:43:14 +00003699PyObject*
3700PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3701{
Victor Stinner99b95382011-07-04 14:23:54 +02003702#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003703 return PyUnicode_DecodeMBCS(s, size, NULL);
3704#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003705 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003706#else
Victor Stinner793b5312011-04-27 00:24:21 +02003707 PyInterpreterState *interp = PyThreadState_GET()->interp;
3708 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3709 cannot use it to encode and decode filenames before it is loaded. Load
3710 the Python codec requires to encode at least its own filename. Use the C
3711 version of the locale codec until the codec registry is initialized and
3712 the Python codec is loaded.
3713
3714 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3715 cannot only rely on it: check also interp->fscodec_initialized for
3716 subinterpreters. */
3717 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718 return PyUnicode_Decode(s, size,
3719 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003720 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003721 }
3722 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003723 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003724 }
Victor Stinnerad158722010-10-27 00:25:46 +00003725#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003726}
3727
Martin v. Löwis011e8422009-05-05 04:43:17 +00003728
3729int
Antoine Pitrou13348842012-01-29 18:36:34 +01003730_PyUnicode_HasNULChars(PyObject* s)
3731{
3732 static PyObject *nul = NULL;
3733
3734 if (nul == NULL)
3735 nul = PyUnicode_FromStringAndSize("\0", 1);
3736 if (nul == NULL)
3737 return -1;
3738 return PyUnicode_Contains(s, nul);
3739}
3740
3741
3742int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003743PyUnicode_FSConverter(PyObject* arg, void* addr)
3744{
3745 PyObject *output = NULL;
3746 Py_ssize_t size;
3747 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003748 if (arg == NULL) {
3749 Py_DECREF(*(PyObject**)addr);
3750 return 1;
3751 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003752 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753 output = arg;
3754 Py_INCREF(output);
3755 }
3756 else {
3757 arg = PyUnicode_FromObject(arg);
3758 if (!arg)
3759 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003760 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003761 Py_DECREF(arg);
3762 if (!output)
3763 return 0;
3764 if (!PyBytes_Check(output)) {
3765 Py_DECREF(output);
3766 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3767 return 0;
3768 }
3769 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003770 size = PyBytes_GET_SIZE(output);
3771 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003772 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003773 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003774 Py_DECREF(output);
3775 return 0;
3776 }
3777 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003778 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003779}
3780
3781
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003782int
3783PyUnicode_FSDecoder(PyObject* arg, void* addr)
3784{
3785 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003786 if (arg == NULL) {
3787 Py_DECREF(*(PyObject**)addr);
3788 return 1;
3789 }
3790 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003791 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003793 output = arg;
3794 Py_INCREF(output);
3795 }
3796 else {
3797 arg = PyBytes_FromObject(arg);
3798 if (!arg)
3799 return 0;
3800 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3801 PyBytes_GET_SIZE(arg));
3802 Py_DECREF(arg);
3803 if (!output)
3804 return 0;
3805 if (!PyUnicode_Check(output)) {
3806 Py_DECREF(output);
3807 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3808 return 0;
3809 }
3810 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003811 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003812 Py_DECREF(output);
3813 return 0;
3814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003816 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003817 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3818 Py_DECREF(output);
3819 return 0;
3820 }
3821 *(PyObject**)addr = output;
3822 return Py_CLEANUP_SUPPORTED;
3823}
3824
3825
Martin v. Löwis5b222132007-06-10 09:51:05 +00003826char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003828{
Christian Heimesf3863112007-11-22 07:46:41 +00003829 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003831 if (!PyUnicode_Check(unicode)) {
3832 PyErr_BadArgument();
3833 return NULL;
3834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003836 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003838 if (PyUnicode_UTF8(unicode) == NULL) {
3839 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3841 if (bytes == NULL)
3842 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003843 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3844 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 Py_DECREF(bytes);
3846 return NULL;
3847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3849 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3850 PyBytes_AS_STRING(bytes),
3851 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852 Py_DECREF(bytes);
3853 }
3854
3855 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003856 *psize = PyUnicode_UTF8_LENGTH(unicode);
3857 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003858}
3859
3860char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3864}
3865
3866#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003867static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868#endif
3869
3870
3871Py_UNICODE *
3872PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 const unsigned char *one_byte;
3875#if SIZEOF_WCHAR_T == 4
3876 const Py_UCS2 *two_bytes;
3877#else
3878 const Py_UCS4 *four_bytes;
3879 const Py_UCS4 *ucs4_end;
3880 Py_ssize_t num_surrogates;
3881#endif
3882 wchar_t *w;
3883 wchar_t *wchar_end;
3884
3885 if (!PyUnicode_Check(unicode)) {
3886 PyErr_BadArgument();
3887 return NULL;
3888 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 assert(_PyUnicode_KIND(unicode) != 0);
3892 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893
3894#ifdef Py_DEBUG
3895 ++unicode_as_unicode_calls;
3896#endif
3897
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3901 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902 num_surrogates = 0;
3903
3904 for (; four_bytes < ucs4_end; ++four_bytes) {
3905 if (*four_bytes > 0xFFFF)
3906 ++num_surrogates;
3907 }
3908
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003909 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3910 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3911 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 PyErr_NoMemory();
3913 return NULL;
3914 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003915 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 w = _PyUnicode_WSTR(unicode);
3918 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3919 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3921 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003922 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003924 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3925 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 }
3927 else
3928 *w = *four_bytes;
3929
3930 if (w > wchar_end) {
3931 assert(0 && "Miscalculated string end");
3932 }
3933 }
3934 *w = 0;
3935#else
3936 /* sizeof(wchar_t) == 4 */
3937 Py_FatalError("Impossible unicode object state, wstr and str "
3938 "should share memory already.");
3939 return NULL;
3940#endif
3941 }
3942 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3944 (_PyUnicode_LENGTH(unicode) + 1));
3945 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 PyErr_NoMemory();
3947 return NULL;
3948 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003949 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3950 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3951 w = _PyUnicode_WSTR(unicode);
3952 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003954 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3955 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 for (; w < wchar_end; ++one_byte, ++w)
3957 *w = *one_byte;
3958 /* null-terminate the wstr */
3959 *w = 0;
3960 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003963 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 for (; w < wchar_end; ++two_bytes, ++w)
3965 *w = *two_bytes;
3966 /* null-terminate the wstr */
3967 *w = 0;
3968#else
3969 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003970 PyObject_FREE(_PyUnicode_WSTR(unicode));
3971 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 Py_FatalError("Impossible unicode object state, wstr "
3973 "and str should share memory already.");
3974 return NULL;
3975#endif
3976 }
3977 else {
3978 assert(0 && "This should never happen.");
3979 }
3980 }
3981 }
3982 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003983 *size = PyUnicode_WSTR_LENGTH(unicode);
3984 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003985}
3986
Alexander Belopolsky40018472011-02-26 01:02:56 +00003987Py_UNICODE *
3988PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991}
3992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993
Alexander Belopolsky40018472011-02-26 01:02:56 +00003994Py_ssize_t
3995PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996{
3997 if (!PyUnicode_Check(unicode)) {
3998 PyErr_BadArgument();
3999 goto onError;
4000 }
4001 return PyUnicode_GET_SIZE(unicode);
4002
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return -1;
4005}
4006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007Py_ssize_t
4008PyUnicode_GetLength(PyObject *unicode)
4009{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004010 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 PyErr_BadArgument();
4012 return -1;
4013 }
4014
4015 return PyUnicode_GET_LENGTH(unicode);
4016}
4017
4018Py_UCS4
4019PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4020{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004021 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4022 PyErr_BadArgument();
4023 return (Py_UCS4)-1;
4024 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004025 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004026 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 return (Py_UCS4)-1;
4028 }
4029 return PyUnicode_READ_CHAR(unicode, index);
4030}
4031
4032int
4033PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4034{
4035 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004036 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 return -1;
4038 }
Victor Stinner488fa492011-12-12 00:01:39 +01004039 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004040 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004041 PyErr_SetString(PyExc_IndexError, "string index out of range");
4042 return -1;
4043 }
Victor Stinner488fa492011-12-12 00:01:39 +01004044 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004045 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004046 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4047 PyErr_SetString(PyExc_ValueError, "character out of range");
4048 return -1;
4049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4051 index, ch);
4052 return 0;
4053}
4054
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string is statically allocated and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4060
Victor Stinner554f3f02010-06-16 23:33:54 +00004061/* create or adjust a UnicodeDecodeError */
4062static void
4063make_decode_exception(PyObject **exceptionObject,
4064 const char *encoding,
4065 const char *input, Py_ssize_t length,
4066 Py_ssize_t startpos, Py_ssize_t endpos,
4067 const char *reason)
4068{
4069 if (*exceptionObject == NULL) {
4070 *exceptionObject = PyUnicodeDecodeError_Create(
4071 encoding, input, length, startpos, endpos, reason);
4072 }
4073 else {
4074 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4075 goto onError;
4076 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4077 goto onError;
4078 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4079 goto onError;
4080 }
4081 return;
4082
4083onError:
4084 Py_DECREF(*exceptionObject);
4085 *exceptionObject = NULL;
4086}
4087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088/* error handling callback helper:
4089 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004090 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091 and adjust various state variables.
4092 return 0 on success, -1 on error
4093*/
4094
Alexander Belopolsky40018472011-02-26 01:02:56 +00004095static int
4096unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004097 const char *encoding, const char *reason,
4098 const char **input, const char **inend, Py_ssize_t *startinpos,
4099 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004100 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004102 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103
4104 PyObject *restuple = NULL;
4105 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004106 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004107 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004108 Py_ssize_t requiredsize;
4109 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004110 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111 int res = -1;
4112
Victor Stinner596a6c42011-11-09 00:02:18 +01004113 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4114 outsize = PyUnicode_GET_LENGTH(*output);
4115 else
4116 outsize = _PyUnicode_WSTR_LENGTH(*output);
4117
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004118 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 *errorHandler = PyCodec_LookupError(errors);
4120 if (*errorHandler == NULL)
4121 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122 }
4123
Victor Stinner554f3f02010-06-16 23:33:54 +00004124 make_decode_exception(exceptionObject,
4125 encoding,
4126 *input, *inend - *input,
4127 *startinpos, *endinpos,
4128 reason);
4129 if (*exceptionObject == NULL)
4130 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131
4132 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4133 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004136 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 }
4139 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004141 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004142 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143
4144 /* Copy back the bytes variables, which might have been modified by the
4145 callback */
4146 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4147 if (!inputobj)
4148 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004149 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004151 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004152 *input = PyBytes_AS_STRING(inputobj);
4153 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004154 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004155 /* we can DECREF safely, as the exception has another reference,
4156 so the object won't go away. */
4157 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004158
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004161 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4163 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004164 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165
Victor Stinner596a6c42011-11-09 00:02:18 +01004166 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4167 /* need more space? (at least enough for what we
4168 have+the replacement+the rest of the string (starting
4169 at the new input position), so we won't have to check space
4170 when there are no errors in the rest of the string) */
4171 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4172 requiredsize = *outpos + replen + insize-newpos;
4173 if (requiredsize > outsize) {
4174 if (requiredsize<2*outsize)
4175 requiredsize = 2*outsize;
4176 if (unicode_resize(output, requiredsize) < 0)
4177 goto onError;
4178 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004179 if (unicode_widen(output, *outpos,
4180 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004182 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004183 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004185 else {
4186 wchar_t *repwstr;
4187 Py_ssize_t repwlen;
4188 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4189 if (repwstr == NULL)
4190 goto onError;
4191 /* need more space? (at least enough for what we
4192 have+the replacement+the rest of the string (starting
4193 at the new input position), so we won't have to check space
4194 when there are no errors in the rest of the string) */
4195 requiredsize = *outpos + repwlen + insize-newpos;
4196 if (requiredsize > outsize) {
4197 if (requiredsize < 2*outsize)
4198 requiredsize = 2*outsize;
4199 if (unicode_resize(output, requiredsize) < 0)
4200 goto onError;
4201 }
4202 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4203 *outpos += repwlen;
4204 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004206 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 /* we made it! */
4209 res = 0;
4210
Benjamin Peterson29060642009-01-31 22:14:21 +00004211 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 Py_XDECREF(restuple);
4213 return res;
4214}
4215
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004299 Py_ssize_t size,
4300 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004301{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4303}
4304
Antoine Pitrou244651a2009-05-04 18:56:13 +00004305/* The decoder. The only state we preserve is our read position,
4306 * i.e. how many characters we have consumed. So if we end in the
4307 * middle of a shift sequence we have to back off the read position
4308 * and the output to the beginning of the sequence, otherwise we lose
4309 * all the shift state (seen bits, number of bits seen, high
4310 * surrogate). */
4311
Alexander Belopolsky40018472011-02-26 01:02:56 +00004312PyObject *
4313PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004314 Py_ssize_t size,
4315 const char *errors,
4316 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004319 Py_ssize_t startinpos;
4320 Py_ssize_t endinpos;
4321 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004323 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324 const char *errmsg = "";
4325 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004326 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 unsigned int base64bits = 0;
4328 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004329 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 PyObject *errorHandler = NULL;
4331 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 /* Start off assuming it's all ASCII. Widen later as necessary. */
4334 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 if (!unicode)
4336 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004337 if (size == 0) {
4338 if (consumed)
4339 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004340 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004341 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004343 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344 e = s + size;
4345
4346 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004347 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004349 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 if (inShift) { /* in a base-64 section */
4352 if (IS_BASE64(ch)) { /* consume a base-64 character */
4353 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4354 base64bits += 6;
4355 s++;
4356 if (base64bits >= 16) {
4357 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004358 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 base64bits -= 16;
4360 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4361 if (surrogate) {
4362 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004363 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4364 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4366 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004368 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 }
4370 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004371 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 }
4375 }
Victor Stinner551ac952011-11-29 22:58:13 +01004376 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 /* first surrogate */
4378 surrogate = outCh;
4379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004381 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4382 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 }
4384 }
4385 }
4386 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 inShift = 0;
4388 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004390 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4391 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004392 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 if (base64bits > 0) { /* left-over bits */
4395 if (base64bits >= 6) {
4396 /* We've seen at least one base-64 character */
4397 errmsg = "partial character in shift sequence";
4398 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else {
4401 /* Some bits remain; they should be zero */
4402 if (base64buffer != 0) {
4403 errmsg = "non-zero padding bits in shift sequence";
4404 goto utf7Error;
4405 }
4406 }
4407 }
4408 if (ch != '-') {
4409 /* '-' is absorbed; other terminating
4410 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4412 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
4416 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 s++; /* consume '+' */
4419 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004421 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4422 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 }
4424 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004426 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 }
4429 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004431 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4432 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 s++;
4434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 else {
4436 startinpos = s-starts;
4437 s++;
4438 errmsg = "unexpected special character";
4439 goto utf7Error;
4440 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 endinpos = s-starts;
4444 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 errors, &errorHandler,
4446 "utf7", errmsg,
4447 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004448 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450 }
4451
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 /* end of string */
4453
4454 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4455 /* if we're in an inconsistent state, that's an error */
4456 if (surrogate ||
4457 (base64bits >= 6) ||
4458 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 endinpos = size;
4460 if (unicode_decode_call_errorhandler(
4461 errors, &errorHandler,
4462 "utf7", "unterminated shift sequence",
4463 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004464 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 goto onError;
4466 if (s < e)
4467 goto restart;
4468 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004469 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470
4471 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004474 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 }
4477 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004478 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004480 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004482 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 goto onError;
4484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 Py_XDECREF(errorHandler);
4486 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004487 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 Py_XDECREF(errorHandler);
4491 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 Py_DECREF(unicode);
4493 return NULL;
4494}
4495
4496
Alexander Belopolsky40018472011-02-26 01:02:56 +00004497PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004498_PyUnicode_EncodeUTF7(PyObject *str,
4499 int base64SetO,
4500 int base64WhiteSpace,
4501 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 int kind;
4504 void *data;
4505 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004506 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004507 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004509 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 unsigned int base64bits = 0;
4511 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 char * out;
4513 char * start;
4514
Benjamin Petersonbac79492012-01-14 13:34:47 -05004515 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 return NULL;
4517 kind = PyUnicode_KIND(str);
4518 data = PyUnicode_DATA(str);
4519 len = PyUnicode_GET_LENGTH(str);
4520
4521 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 /* It might be possible to tighten this worst case */
4525 allocated = 8 * len;
4526 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004527 return PyErr_NoMemory();
4528
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 if (v == NULL)
4531 return NULL;
4532
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004533 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004534 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004535 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 if (inShift) {
4538 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4539 /* shifting out */
4540 if (base64bits) { /* output remaining bits */
4541 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4542 base64buffer = 0;
4543 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
4545 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 /* Characters not in the BASE64 set implicitly unshift the sequence
4547 so no '-' is required, except if the character is itself a '-' */
4548 if (IS_BASE64(ch) || ch == '-') {
4549 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 *out++ = (char) ch;
4552 }
4553 else {
4554 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004555 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 else { /* not in a shift sequence */
4558 if (ch == '+') {
4559 *out++ = '+';
4560 *out++ = '-';
4561 }
4562 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4563 *out++ = (char) ch;
4564 }
4565 else {
4566 *out++ = '+';
4567 inShift = 1;
4568 goto encode_char;
4569 }
4570 }
4571 continue;
4572encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004574 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004575
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 /* code first surrogate */
4577 base64bits += 16;
4578 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4579 while (base64bits >= 6) {
4580 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4581 base64bits -= 6;
4582 }
4583 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004584 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 base64bits += 16;
4587 base64buffer = (base64buffer << 16) | ch;
4588 while (base64bits >= 6) {
4589 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4590 base64bits -= 6;
4591 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004592 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 if (base64bits)
4594 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4595 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004596 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004597 if (_PyBytes_Resize(&v, out - start) < 0)
4598 return NULL;
4599 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601PyObject *
4602PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4603 Py_ssize_t size,
4604 int base64SetO,
4605 int base64WhiteSpace,
4606 const char *errors)
4607{
4608 PyObject *result;
4609 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4610 if (tmp == NULL)
4611 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004612 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004613 base64WhiteSpace, errors);
4614 Py_DECREF(tmp);
4615 return result;
4616}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617
/* The UTF-7 helper macros are not needed past this point; undefine them
   so they are not visible to the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623
/* --- UTF-8 Codec -------------------------------------------------------- */
4625
Alexander Belopolsky40018472011-02-26 01:02:56 +00004626PyObject *
4627PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004628 Py_ssize_t size,
4629 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630{
Walter Dörwald69652032004-09-07 20:24:22 +00004631 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4632}
4633
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004634#include "stringlib/asciilib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004638#include "stringlib/ucs1lib.h"
4639#include "stringlib/codecs.h"
4640#include "stringlib/undef.h"
4641
4642#include "stringlib/ucs2lib.h"
4643#include "stringlib/codecs.h"
4644#include "stringlib/undef.h"
4645
4646#include "stringlib/ucs4lib.h"
4647#include "stringlib/codecs.h"
4648#include "stringlib/undef.h"
4649
/* Mask to check or force alignment of a pointer to C 'long' boundaries:
   (addr & LONG_PTR_MASK) is zero for an aligned address, and
   (addr & ~LONG_PTR_MASK) rounds an address down to a boundary.
   NOTE(review): the expansion is a bare cast expression, not wrapped in
   parentheses; keep uses to simple operands of '&' and '~'. */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: the top bit of every byte is set, so
   (value & ASCII_CHAR_MASK) is non-zero iff some byte is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4662
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663static Py_ssize_t
4664ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004665{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666 const char *p = start;
4667 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004668
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669#if SIZEOF_LONG <= SIZEOF_VOID_P
4670 assert(!((size_t) dest & LONG_PTR_MASK));
4671 if (!((size_t) p & LONG_PTR_MASK)) {
4672 /* Fast path, see in STRINGLIB(utf8_decode) for
4673 an explanation. */
4674 /* Help register allocation */
4675 register const char *_p = p;
4676 register Py_UCS1 * q = dest;
4677 while (_p < aligned_end) {
4678 unsigned long value = *(const unsigned long *) _p;
4679 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 *((unsigned long *)q) = value;
4682 _p += SIZEOF_LONG;
4683 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004684 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685 p = _p;
4686 while (p < end) {
4687 if ((unsigned char)*p & 0x80)
4688 break;
4689 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004691 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693#endif
4694 while (p < end) {
4695 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4696 for an explanation. */
4697 if (!((size_t) p & LONG_PTR_MASK)) {
4698 /* Help register allocation */
4699 register const char *_p = p;
4700 while (_p < aligned_end) {
4701 unsigned long value = *(unsigned long *) _p;
4702 if (value & ASCII_CHAR_MASK)
4703 break;
4704 _p += SIZEOF_LONG;
4705 }
4706 p = _p;
4707 if (_p == end)
4708 break;
4709 }
4710 if ((unsigned char)*p & 0x80)
4711 break;
4712 ++p;
4713 }
4714 memcpy(dest, start, p - start);
4715 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716}
Antoine Pitrouab868312009-01-10 15:40:25 +00004717
Victor Stinner785938e2011-12-11 20:09:03 +01004718PyObject *
4719PyUnicode_DecodeUTF8Stateful(const char *s,
4720 Py_ssize_t size,
4721 const char *errors,
4722 Py_ssize_t *consumed)
4723{
Victor Stinner785938e2011-12-11 20:09:03 +01004724 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004725 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 const char *end = s + size;
4727 Py_ssize_t outpos;
4728
4729 Py_ssize_t startinpos;
4730 Py_ssize_t endinpos;
4731 const char *errmsg = "";
4732 PyObject *errorHandler = NULL;
4733 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004734
4735 if (size == 0) {
4736 if (consumed)
4737 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004738 Py_INCREF(unicode_empty);
4739 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004740 }
4741
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4743 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004744 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 *consumed = 1;
4746 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004747 }
4748
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004750 if (!unicode)
4751 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004752
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4754 s += outpos;
4755 while (s < end) {
4756 Py_UCS4 ch;
4757 int kind = PyUnicode_KIND(unicode);
4758 if (kind == PyUnicode_1BYTE_KIND) {
4759 if (PyUnicode_IS_ASCII(unicode))
4760 ch = asciilib_utf8_decode(&s, end,
4761 PyUnicode_1BYTE_DATA(unicode), &outpos);
4762 else
4763 ch = ucs1lib_utf8_decode(&s, end,
4764 PyUnicode_1BYTE_DATA(unicode), &outpos);
4765 } else if (kind == PyUnicode_2BYTE_KIND) {
4766 ch = ucs2lib_utf8_decode(&s, end,
4767 PyUnicode_2BYTE_DATA(unicode), &outpos);
4768 } else {
4769 assert(kind == PyUnicode_4BYTE_KIND);
4770 ch = ucs4lib_utf8_decode(&s, end,
4771 PyUnicode_4BYTE_DATA(unicode), &outpos);
4772 }
4773
4774 switch (ch) {
4775 case 0:
4776 if (s == end || consumed)
4777 goto End;
4778 errmsg = "unexpected end of data";
4779 startinpos = s - starts;
4780 endinpos = startinpos + 1;
4781 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4782 endinpos++;
4783 break;
4784 case 1:
4785 errmsg = "invalid start byte";
4786 startinpos = s - starts;
4787 endinpos = startinpos + 1;
4788 break;
4789 case 2:
4790 errmsg = "invalid continuation byte";
4791 startinpos = s - starts;
4792 endinpos = startinpos + 1;
4793 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4794 endinpos++;
4795 break;
4796 default:
4797 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4798 goto onError;
4799 continue;
4800 }
4801
4802 if (unicode_decode_call_errorhandler(
4803 errors, &errorHandler,
4804 "utf-8", errmsg,
4805 &starts, &end, &startinpos, &endinpos, &exc, &s,
4806 &unicode, &outpos))
4807 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004808 }
4809
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004810End:
4811 if (unicode_resize(&unicode, outpos) < 0)
4812 goto onError;
4813
4814 if (consumed)
4815 *consumed = s - starts;
4816
4817 Py_XDECREF(errorHandler);
4818 Py_XDECREF(exc);
4819 assert(_PyUnicode_CheckConsistency(unicode, 1));
4820 return unicode;
4821
4822onError:
4823 Py_XDECREF(errorHandler);
4824 Py_XDECREF(exc);
4825 Py_XDECREF(unicode);
4826 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004827}
4828
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc(),
   or NULL on failure.  An exception is set only when the size computation
   would overflow; a failed allocation returns NULL without setting one.
   Every byte that cannot be decoded is stored as 0xDC00 + byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* What gets split into a surrogate pair below is a code
               point outside the BMP; it is never itself a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884/* Primary internal function which creates utf8 encoded bytes objects.
4885
4886 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004887 and allocate exactly as much space needed at the end. Else allocate the
4888 maximum possible needed (4 result bytes per Unicode character), and return
4889 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004890*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004891PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004892_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893{
Victor Stinner6099a032011-12-18 14:22:26 +01004894 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004895 void *data;
4896 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898 if (!PyUnicode_Check(unicode)) {
4899 PyErr_BadArgument();
4900 return NULL;
4901 }
4902
4903 if (PyUnicode_READY(unicode) == -1)
4904 return NULL;
4905
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004906 if (PyUnicode_UTF8(unicode))
4907 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4908 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909
4910 kind = PyUnicode_KIND(unicode);
4911 data = PyUnicode_DATA(unicode);
4912 size = PyUnicode_GET_LENGTH(unicode);
4913
Benjamin Petersonead6b532011-12-20 17:23:42 -06004914 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004915 default:
4916 assert(0);
4917 case PyUnicode_1BYTE_KIND:
4918 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4919 assert(!PyUnicode_IS_ASCII(unicode));
4920 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4921 case PyUnicode_2BYTE_KIND:
4922 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4923 case PyUnicode_4BYTE_KIND:
4924 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926}
4927
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004929PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4930 Py_ssize_t size,
4931 const char *errors)
4932{
4933 PyObject *v, *unicode;
4934
4935 unicode = PyUnicode_FromUnicode(s, size);
4936 if (unicode == NULL)
4937 return NULL;
4938 v = _PyUnicode_AsUTF8String(unicode, errors);
4939 Py_DECREF(unicode);
4940 return v;
4941}
4942
4943PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004944PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947}
4948
/* --- UTF-32 Codec ------------------------------------------------------- */
4950
4951PyObject *
4952PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 Py_ssize_t size,
4954 const char *errors,
4955 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956{
4957 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4958}
4959
4960PyObject *
4961PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 Py_ssize_t size,
4963 const char *errors,
4964 int *byteorder,
4965 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966{
4967 const char *starts = s;
4968 Py_ssize_t startinpos;
4969 Py_ssize_t endinpos;
4970 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004971 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004972 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973 int bo = 0; /* assume native ordering by default */
4974 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004975 /* Offsets from q for retrieving bytes in the right order. */
4976#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4977 int iorder[] = {0, 1, 2, 3};
4978#else
4979 int iorder[] = {3, 2, 1, 0};
4980#endif
4981 PyObject *errorHandler = NULL;
4982 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004983
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 q = (unsigned char *)s;
4985 e = q + size;
4986
4987 if (byteorder)
4988 bo = *byteorder;
4989
4990 /* Check for BOM marks (U+FEFF) in the input and adjust current
4991 byte order setting accordingly. In native mode, the leading BOM
4992 mark is skipped, in all other modes, it is copied to the output
4993 stream as-is (giving a ZWNBSP character). */
4994 if (bo == 0) {
4995 if (size >= 4) {
4996 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004998#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 if (bom == 0x0000FEFF) {
5000 q += 4;
5001 bo = -1;
5002 }
5003 else if (bom == 0xFFFE0000) {
5004 q += 4;
5005 bo = 1;
5006 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 if (bom == 0x0000FEFF) {
5009 q += 4;
5010 bo = 1;
5011 }
5012 else if (bom == 0xFFFE0000) {
5013 q += 4;
5014 bo = -1;
5015 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005016#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005018 }
5019
5020 if (bo == -1) {
5021 /* force LE */
5022 iorder[0] = 0;
5023 iorder[1] = 1;
5024 iorder[2] = 2;
5025 iorder[3] = 3;
5026 }
5027 else if (bo == 1) {
5028 /* force BE */
5029 iorder[0] = 3;
5030 iorder[1] = 2;
5031 iorder[2] = 1;
5032 iorder[3] = 0;
5033 }
5034
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005035 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005036 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005037 if (!unicode)
5038 return NULL;
5039 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005040 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005041 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005042
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 Py_UCS4 ch;
5045 /* remaining bytes at the end? (size should be divisible by 4) */
5046 if (e-q<4) {
5047 if (consumed)
5048 break;
5049 errmsg = "truncated data";
5050 startinpos = ((const char *)q)-starts;
5051 endinpos = ((const char *)e)-starts;
5052 goto utf32Error;
5053 /* The remaining input chars are ignored if the callback
5054 chooses to skip the input */
5055 }
5056 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5057 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 if (ch >= 0x110000)
5060 {
5061 errmsg = "codepoint not in range(0x110000)";
5062 startinpos = ((const char *)q)-starts;
5063 endinpos = startinpos+4;
5064 goto utf32Error;
5065 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005066 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5067 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 q += 4;
5069 continue;
5070 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 if (unicode_decode_call_errorhandler(
5072 errors, &errorHandler,
5073 "utf32", errmsg,
5074 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005075 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077 }
5078
5079 if (byteorder)
5080 *byteorder = bo;
5081
5082 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084
5085 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005086 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087 goto onError;
5088
5089 Py_XDECREF(errorHandler);
5090 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005091 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 Py_DECREF(unicode);
5095 Py_XDECREF(errorHandler);
5096 Py_XDECREF(exc);
5097 return NULL;
5098}
5099
5100PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005101_PyUnicode_EncodeUTF32(PyObject *str,
5102 const char *errors,
5103 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105 int kind;
5106 void *data;
5107 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005108 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005110 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 /* Offsets from p for storing byte pairs in the right order. */
5112#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5113 int iorder[] = {0, 1, 2, 3};
5114#else
5115 int iorder[] = {3, 2, 1, 0};
5116#endif
5117
Benjamin Peterson29060642009-01-31 22:14:21 +00005118#define STORECHAR(CH) \
5119 do { \
5120 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5121 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5122 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5123 p[iorder[0]] = (CH) & 0xff; \
5124 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005125 } while(0)
5126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005127 if (!PyUnicode_Check(str)) {
5128 PyErr_BadArgument();
5129 return NULL;
5130 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005131 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132 return NULL;
5133 kind = PyUnicode_KIND(str);
5134 data = PyUnicode_DATA(str);
5135 len = PyUnicode_GET_LENGTH(str);
5136
5137 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005138 bytesize = nsize * 4;
5139 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005141 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142 if (v == NULL)
5143 return NULL;
5144
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005145 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005148 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005149 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150
5151 if (byteorder == -1) {
5152 /* force LE */
5153 iorder[0] = 0;
5154 iorder[1] = 1;
5155 iorder[2] = 2;
5156 iorder[3] = 3;
5157 }
5158 else if (byteorder == 1) {
5159 /* force BE */
5160 iorder[0] = 3;
5161 iorder[1] = 2;
5162 iorder[2] = 1;
5163 iorder[3] = 0;
5164 }
5165
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005166 for (i = 0; i < len; i++)
5167 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005168
5169 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005170 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005171#undef STORECHAR
5172}
5173
Alexander Belopolsky40018472011-02-26 01:02:56 +00005174PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005175PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5176 Py_ssize_t size,
5177 const char *errors,
5178 int byteorder)
5179{
5180 PyObject *result;
5181 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5182 if (tmp == NULL)
5183 return NULL;
5184 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5185 Py_DECREF(tmp);
5186 return result;
5187}
5188
5189PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005190PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005191{
Victor Stinnerb960b342011-11-20 19:12:52 +01005192 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005193}
5194
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195/* --- UTF-16 Codec ------------------------------------------------------- */
5196
Tim Peters772747b2001-08-09 22:21:55 +00005197PyObject *
5198PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 Py_ssize_t size,
5200 const char *errors,
5201 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202{
Walter Dörwald69652032004-09-07 20:24:22 +00005203 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5204}
5205
5206PyObject *
5207PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_ssize_t size,
5209 const char *errors,
5210 int *byteorder,
5211 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005212{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214 Py_ssize_t startinpos;
5215 Py_ssize_t endinpos;
5216 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005217 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005218 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005219 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005220 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005221 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 PyObject *errorHandler = NULL;
5223 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224
Tim Peters772747b2001-08-09 22:21:55 +00005225 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005226 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005229 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005231 /* Check for BOM marks (U+FEFF) in the input and adjust current
5232 byte order setting accordingly. In native mode, the leading BOM
5233 mark is skipped, in all other modes, it is copied to the output
5234 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005235 if (bo == 0 && size >= 2) {
5236 const Py_UCS4 bom = (q[1] << 8) | q[0];
5237 if (bom == 0xFEFF) {
5238 q += 2;
5239 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 else if (bom == 0xFFFE) {
5242 q += 2;
5243 bo = 1;
5244 }
5245 if (byteorder)
5246 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 if (q == e) {
5250 if (consumed)
5251 *consumed = size;
5252 Py_INCREF(unicode_empty);
5253 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005254 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005255
Antoine Pitrouab868312009-01-10 15:40:25 +00005256#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005257 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005258#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005259 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005260#endif
Tim Peters772747b2001-08-09 22:21:55 +00005261
Antoine Pitrou63065d72012-05-15 23:48:04 +02005262 /* Note: size will always be longer than the resulting Unicode
5263 character count */
5264 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5265 if (!unicode)
5266 return NULL;
5267
5268 outpos = 0;
5269 while (1) {
5270 Py_UCS4 ch = 0;
5271 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005272 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005273 if (kind == PyUnicode_1BYTE_KIND) {
5274 if (PyUnicode_IS_ASCII(unicode))
5275 ch = asciilib_utf16_decode(&q, e,
5276 PyUnicode_1BYTE_DATA(unicode), &outpos,
5277 native_ordering);
5278 else
5279 ch = ucs1lib_utf16_decode(&q, e,
5280 PyUnicode_1BYTE_DATA(unicode), &outpos,
5281 native_ordering);
5282 } else if (kind == PyUnicode_2BYTE_KIND) {
5283 ch = ucs2lib_utf16_decode(&q, e,
5284 PyUnicode_2BYTE_DATA(unicode), &outpos,
5285 native_ordering);
5286 } else {
5287 assert(kind == PyUnicode_4BYTE_KIND);
5288 ch = ucs4lib_utf16_decode(&q, e,
5289 PyUnicode_4BYTE_DATA(unicode), &outpos,
5290 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005291 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005292 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293
Antoine Pitrou63065d72012-05-15 23:48:04 +02005294 switch (ch)
5295 {
5296 case 0:
5297 /* remaining byte at the end? (size should be even) */
5298 if (q == e || consumed)
5299 goto End;
5300 errmsg = "truncated data";
5301 startinpos = ((const char *)q) - starts;
5302 endinpos = ((const char *)e) - starts;
5303 break;
5304 /* The remaining input chars are ignored if the callback
5305 chooses to skip the input */
5306 case 1:
5307 errmsg = "unexpected end of data";
5308 startinpos = ((const char *)q) - 2 - starts;
5309 endinpos = ((const char *)e) - starts;
5310 break;
5311 case 2:
5312 errmsg = "illegal encoding";
5313 startinpos = ((const char *)q) - 2 - starts;
5314 endinpos = startinpos + 2;
5315 break;
5316 case 3:
5317 errmsg = "illegal UTF-16 surrogate";
5318 startinpos = ((const char *)q) - 4 - starts;
5319 endinpos = startinpos + 2;
5320 break;
5321 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005322 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5323 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 continue;
5325 }
5326
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005328 errors,
5329 &errorHandler,
5330 "utf16", errmsg,
5331 &starts,
5332 (const char **)&e,
5333 &startinpos,
5334 &endinpos,
5335 &exc,
5336 (const char **)&q,
5337 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005338 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 }
5341
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342End:
Walter Dörwald69652032004-09-07 20:24:22 +00005343 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005345
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005347 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 goto onError;
5349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 Py_XDECREF(errorHandler);
5351 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005352 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 Py_XDECREF(errorHandler);
5357 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 return NULL;
5359}
5360
Tim Peters772747b2001-08-09 22:21:55 +00005361PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005362_PyUnicode_EncodeUTF16(PyObject *str,
5363 const char *errors,
5364 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 enum PyUnicode_Kind kind;
5367 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005368 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005369 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005370 unsigned short *out;
5371 Py_ssize_t bytesize;
5372 Py_ssize_t pairs;
5373#ifdef WORDS_BIGENDIAN
5374 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005375#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005377#endif
5378
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 if (!PyUnicode_Check(str)) {
5380 PyErr_BadArgument();
5381 return NULL;
5382 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005383 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 return NULL;
5385 kind = PyUnicode_KIND(str);
5386 data = PyUnicode_DATA(str);
5387 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005388
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005390 if (kind == PyUnicode_4BYTE_KIND) {
5391 const Py_UCS4 *in = (const Py_UCS4 *)data;
5392 const Py_UCS4 *end = in + len;
5393 while (in < end)
5394 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005395 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005396 }
5397 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005399 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005400 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 if (v == NULL)
5402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005404 /* output buffer is 2-bytes aligned */
5405 assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0);
5406 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005408 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005409 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005410 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005411
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005412 switch (kind) {
5413 case PyUnicode_1BYTE_KIND: {
5414 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5415 break;
Tim Peters772747b2001-08-09 22:21:55 +00005416 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005417 case PyUnicode_2BYTE_KIND: {
5418 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5419 break;
Tim Peters772747b2001-08-09 22:21:55 +00005420 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005421 case PyUnicode_4BYTE_KIND: {
5422 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5423 break;
5424 }
5425 default:
5426 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005427 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005428
5429 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005430 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431}
5432
Alexander Belopolsky40018472011-02-26 01:02:56 +00005433PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005434PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5435 Py_ssize_t size,
5436 const char *errors,
5437 int byteorder)
5438{
5439 PyObject *result;
5440 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5441 if (tmp == NULL)
5442 return NULL;
5443 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5444 Py_DECREF(tmp);
5445 return result;
5446}
5447
5448PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005449PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005451 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452}
5453
5454/* --- Unicode Escape Codec ----------------------------------------------- */
5455
/* Helper for PyUnicode_DecodeUnicodeEscape: pre-scan the escaped input.

   Returns the number of characters the decoded string will contain when
   the result is guaranteed to be pure ASCII, i.e. when the input consists
   only of ASCII bytes and of backslash escapes that decode to ASCII
   (simple escapes such as \n, backslash-newline, and unrecognised escapes,
   which are kept as two characters).

   Returns -1 when that guarantee cannot be given: `size` is negative, a
   non-ASCII byte occurs, a backslash is the last byte, or a numeric or
   named escape (octal, \x, \u, \U, \N) is present.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t pos = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (pos < size) {
        unsigned char c = bytes[pos++];

        if (c > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (c != '\\') {
            /* Normal character */
            count++;
            continue;
        }

        /* Backslash: the escape must not run off the end of the input,
           and the byte that follows must itself be ASCII. */
        if (pos >= size)
            return -1;
        c = bytes[pos++];
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            /* simple escape: exactly one ASCII character */
            count++;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* unknown escape: the backslash and the other character are
               both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5510
Fredrik Lundh06d12682001-01-24 07:59:11 +00005511static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005512
Alexander Belopolsky40018472011-02-26 01:02:56 +00005513PyObject *
5514PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005515 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005516 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005518 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005519 Py_ssize_t startinpos;
5520 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005522 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524 char* message;
5525 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 PyObject *errorHandler = NULL;
5527 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005528 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005531 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532
5533 /* After length_of_escaped_ascii_string() there are two alternatives,
5534 either the string is pure ASCII with named escapes like \n, etc.
5535 and we determined it's exact size (common case)
5536 or it contains \x, \u, ... escape sequences. then we create a
5537 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005538 if (len >= 0) {
5539 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 if (!v)
5541 goto onError;
5542 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 }
5544 else {
5545 /* Escaped strings will always be longer than the resulting
5546 Unicode string, so we start with size here and then reduce the
5547 length after conversion to the true value.
5548 (but if the error callback returns a long replacement string
5549 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005550 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 if (!v)
5552 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005553 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005554 }
5555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005557 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 while (s < end) {
5562 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005563 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005566 /* The only case in which i == ascii_length is a backslash
5567 followed by a newline. */
5568 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 /* Non-escape characters are interpreted as Unicode ordinals */
5571 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005572 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5573 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 continue;
5575 }
5576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005577 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 /* \ - Escapes */
5579 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005580 c = *s++;
5581 if (s > end)
5582 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584 /* The only case in which i == ascii_length is a backslash
5585 followed by a newline. */
5586 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005587
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005588 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005591#define WRITECHAR(ch) \
5592 do { \
5593 if (unicode_putchar(&v, &i, ch) < 0) \
5594 goto onError; \
5595 }while(0)
5596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005598 case '\\': WRITECHAR('\\'); break;
5599 case '\'': WRITECHAR('\''); break;
5600 case '\"': WRITECHAR('\"'); break;
5601 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005603 case 'f': WRITECHAR('\014'); break;
5604 case 't': WRITECHAR('\t'); break;
5605 case 'n': WRITECHAR('\n'); break;
5606 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005608 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005609 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005610 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 case '0': case '1': case '2': case '3':
5614 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005615 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005616 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005617 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005618 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005619 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005621 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 break;
5623
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 /* hex escapes */
5625 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005627 digits = 2;
5628 message = "truncated \\xXX escape";
5629 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005633 digits = 4;
5634 message = "truncated \\uXXXX escape";
5635 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005638 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639 digits = 8;
5640 message = "truncated \\UXXXXXXXX escape";
5641 hexescape:
5642 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 if (s+digits>end) {
5644 endinpos = size;
5645 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 errors, &errorHandler,
5647 "unicodeescape", "end of string in escape sequence",
5648 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 goto onError;
5651 goto nextByte;
5652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653 for (j = 0; j < digits; ++j) {
5654 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005655 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005657 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 errors, &errorHandler,
5659 "unicodeescape", message,
5660 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005661 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005662 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005663 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005665 }
5666 chr = (chr<<4) & ~0xF;
5667 if (c >= '0' && c <= '9')
5668 chr += c - '0';
5669 else if (c >= 'a' && c <= 'f')
5670 chr += 10 + c - 'a';
5671 else
5672 chr += 10 + c - 'A';
5673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005675 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 /* _decoding_error will have already written into the
5677 target buffer. */
5678 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005679 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005680 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005681 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005682 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 errors, &errorHandler,
5687 "unicodeescape", "illegal Unicode character",
5688 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005689 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005690 goto onError;
5691 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005692 break;
5693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005695 case 'N':
5696 message = "malformed \\N character escape";
5697 if (ucnhash_CAPI == NULL) {
5698 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5700 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005701 if (ucnhash_CAPI == NULL)
5702 goto ucnhashError;
5703 }
5704 if (*s == '{') {
5705 const char *start = s+1;
5706 /* look for the closing brace */
5707 while (*s != '}' && s < end)
5708 s++;
5709 if (s > start && s < end && *s == '}') {
5710 /* found a name. look it up in the unicode database */
5711 message = "unknown Unicode character name";
5712 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005713 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005714 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005715 goto store;
5716 }
5717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 errors, &errorHandler,
5721 "unicodeescape", message,
5722 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005723 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005724 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 break;
5726
5727 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005728 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005729 message = "\\ at end of string";
5730 s--;
5731 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 errors, &errorHandler,
5734 "unicodeescape", message,
5735 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005737 goto onError;
5738 }
5739 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005740 WRITECHAR('\\');
5741 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005742 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005749
Victor Stinner16e6a802011-12-12 13:24:15 +01005750 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005751 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005752 Py_XDECREF(errorHandler);
5753 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005754 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005757 PyErr_SetString(
5758 PyExc_UnicodeError,
5759 "\\N escapes not supported (can't load unicodedata module)"
5760 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005761 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 Py_XDECREF(errorHandler);
5763 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005764 return NULL;
5765
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 Py_XDECREF(errorHandler);
5769 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 return NULL;
5771}
5772
5773/* Return a Unicode-Escape string version of the Unicode object.
5774
5775 If quotes is true, the string is enclosed in u"" or u'' quotes as
5776 appropriate.
5777
5778*/
5779
Alexander Belopolsky40018472011-02-26 01:02:56 +00005780PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005781PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005783 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005784 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005786 int kind;
5787 void *data;
5788 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789
Thomas Wouters89f507f2006-12-13 04:49:30 +00005790 /* Initial allocation is based on the longest-possible unichr
5791 escape.
5792
5793 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5794 unichr, so in this case it's the longest unichr escape. In
5795 narrow (UTF-16) builds this is five chars per source unichr
5796 since there are two unichrs in the surrogate pair, so in narrow
5797 (UTF-16) builds it's not the longest unichr escape.
5798
5799 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5800 so in the narrow (UTF-16) build case it's the longest unichr
5801 escape.
5802 */
5803
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005804 if (!PyUnicode_Check(unicode)) {
5805 PyErr_BadArgument();
5806 return NULL;
5807 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005808 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005809 return NULL;
5810 len = PyUnicode_GET_LENGTH(unicode);
5811 kind = PyUnicode_KIND(unicode);
5812 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005813 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005814 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5815 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5816 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5817 }
5818
5819 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005820 return PyBytes_FromStringAndSize(NULL, 0);
5821
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005822 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005824
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005825 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005827 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 if (repr == NULL)
5830 return NULL;
5831
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005834 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005835 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005836
Walter Dörwald79e913e2007-05-12 11:08:06 +00005837 /* Escape backslashes */
5838 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 *p++ = '\\';
5840 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005841 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005842 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005843
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005844 /* Map 21-bit characters to '\U00xxxxxx' */
5845 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005846 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005847 *p++ = '\\';
5848 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005849 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5850 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5851 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5852 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5853 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5854 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5855 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5856 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005858 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005859
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005861 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 *p++ = '\\';
5863 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005864 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5865 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5866 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5867 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005870 /* Map special whitespace to '\t', \n', '\r' */
5871 else if (ch == '\t') {
5872 *p++ = '\\';
5873 *p++ = 't';
5874 }
5875 else if (ch == '\n') {
5876 *p++ = '\\';
5877 *p++ = 'n';
5878 }
5879 else if (ch == '\r') {
5880 *p++ = '\\';
5881 *p++ = 'r';
5882 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005883
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005884 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005885 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005887 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005888 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5889 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005890 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005891
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 /* Copy everything else as-is */
5893 else
5894 *p++ = (char) ch;
5895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005897 assert(p - PyBytes_AS_STRING(repr) > 0);
5898 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5899 return NULL;
5900 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901}
5902
Alexander Belopolsky40018472011-02-26 01:02:56 +00005903PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5905 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 PyObject *result;
5908 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5909 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 result = PyUnicode_AsUnicodeEscapeString(tmp);
5912 Py_DECREF(tmp);
5913 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914}
5915
5916/* --- Raw Unicode Escape Codec ------------------------------------------- */
5917
Alexander Belopolsky40018472011-02-26 01:02:56 +00005918PyObject *
5919PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005920 Py_ssize_t size,
5921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005924 Py_ssize_t startinpos;
5925 Py_ssize_t endinpos;
5926 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005927 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 const char *end;
5929 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 PyObject *errorHandler = NULL;
5931 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 /* Escaped strings will always be longer than the resulting
5934 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 length after conversion to the true value. (But decoding error
5936 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005937 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005941 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005942 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 end = s + size;
5944 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 unsigned char c;
5946 Py_UCS4 x;
5947 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005948 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 /* Non-escape characters are interpreted as Unicode ordinals */
5951 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5953 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005955 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 startinpos = s-starts;
5957
5958 /* \u-escapes are only interpreted iff the number of leading
5959 backslashes if odd */
5960 bs = s;
5961 for (;s < end;) {
5962 if (*s != '\\')
5963 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5965 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 }
5967 if (((s - bs) & 1) == 0 ||
5968 s >= end ||
5969 (*s != 'u' && *s != 'U')) {
5970 continue;
5971 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005972 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 count = *s=='u' ? 4 : 8;
5974 s++;
5975
5976 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 for (x = 0, i = 0; i < count; ++i, ++s) {
5978 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005979 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 endinpos = s-starts;
5981 if (unicode_decode_call_errorhandler(
5982 errors, &errorHandler,
5983 "rawunicodeescape", "truncated \\uXXXX",
5984 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005985 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 goto onError;
5987 goto nextByte;
5988 }
5989 x = (x<<4) & ~0xF;
5990 if (c >= '0' && c <= '9')
5991 x += c - '0';
5992 else if (c >= 'a' && c <= 'f')
5993 x += 10 + c - 'a';
5994 else
5995 x += 10 + c - 'A';
5996 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005997 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005998 if (unicode_putchar(&v, &outpos, x) < 0)
5999 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006000 } else {
6001 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006002 if (unicode_decode_call_errorhandler(
6003 errors, &errorHandler,
6004 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006006 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006008 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 nextByte:
6010 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006012 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 Py_XDECREF(errorHandler);
6015 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006016 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006017
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 Py_XDECREF(errorHandler);
6021 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 return NULL;
6023}
6024
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006029 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 char *p;
6031 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006032 Py_ssize_t expandsize, pos;
6033 int kind;
6034 void *data;
6035 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037 if (!PyUnicode_Check(unicode)) {
6038 PyErr_BadArgument();
6039 return NULL;
6040 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006041 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042 return NULL;
6043 kind = PyUnicode_KIND(unicode);
6044 data = PyUnicode_DATA(unicode);
6045 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006046 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6047 bytes, and 1 byte characters 4. */
6048 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006049
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006052
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006053 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 if (repr == NULL)
6055 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006057 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006059 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006060 for (pos = 0; pos < len; pos++) {
6061 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 /* Map 32-bit characters to '\Uxxxxxxxx' */
6063 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006064 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006065 *p++ = '\\';
6066 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006067 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6068 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6069 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6070 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6071 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6072 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6073 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6074 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 *p++ = '\\';
6079 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006080 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6081 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6082 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6083 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 /* Copy everything else as-is */
6086 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 *p++ = (char) ch;
6088 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006089
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006090 assert(p > q);
6091 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006092 return NULL;
6093 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094}
6095
Alexander Belopolsky40018472011-02-26 01:02:56 +00006096PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006097PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6098 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100 PyObject *result;
6101 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6102 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006103 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006104 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6105 Py_DECREF(tmp);
6106 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107}
6108
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006109/* --- Unicode Internal Codec ------------------------------------------- */
6110
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111PyObject *
6112_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006113 Py_ssize_t size,
6114 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006115{
6116 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006117 Py_ssize_t startinpos;
6118 Py_ssize_t endinpos;
6119 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006120 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 const char *end;
6122 const char *reason;
6123 PyObject *errorHandler = NULL;
6124 PyObject *exc = NULL;
6125
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006126 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006127 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006128 1))
6129 return NULL;
6130
Thomas Wouters89f507f2006-12-13 04:49:30 +00006131 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006132 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006133 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006135 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006136 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006137 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006138 end = s + size;
6139
6140 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006141 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006142 Py_UCS4 ch;
6143 /* We copy the raw representation one byte at a time because the
6144 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006145 ((char *) &uch)[0] = s[0];
6146 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006147#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006148 ((char *) &uch)[2] = s[2];
6149 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006150#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006151 ch = uch;
6152
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006153 /* We have to sanity check the raw data, otherwise doom looms for
6154 some malformed UCS-4 data. */
6155 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006156#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006157 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006158#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006159 end-s < Py_UNICODE_SIZE
6160 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006162 startinpos = s - starts;
6163 if (end-s < Py_UNICODE_SIZE) {
6164 endinpos = end-starts;
6165 reason = "truncated input";
6166 }
6167 else {
6168 endinpos = s - starts + Py_UNICODE_SIZE;
6169 reason = "illegal code point (> 0x10FFFF)";
6170 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006171 if (unicode_decode_call_errorhandler(
6172 errors, &errorHandler,
6173 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006174 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006175 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006176 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006177 continue;
6178 }
6179
6180 s += Py_UNICODE_SIZE;
6181#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006182 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006183 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006184 Py_UNICODE uch2;
6185 ((char *) &uch2)[0] = s[0];
6186 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006187 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006188 {
Victor Stinner551ac952011-11-29 22:58:13 +01006189 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006190 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006191 }
6192 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006193#endif
6194
6195 if (unicode_putchar(&v, &outpos, ch) < 0)
6196 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006197 }
6198
Victor Stinner16e6a802011-12-12 13:24:15 +01006199 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006200 goto onError;
6201 Py_XDECREF(errorHandler);
6202 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006203 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006204
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006206 Py_XDECREF(v);
6207 Py_XDECREF(errorHandler);
6208 Py_XDECREF(exc);
6209 return NULL;
6210}
6211
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212/* --- Latin-1 Codec ------------------------------------------------------ */
6213
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214PyObject *
6215PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006216 Py_ssize_t size,
6217 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006220 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006224static void
6225make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006226 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006227 PyObject *unicode,
6228 Py_ssize_t startpos, Py_ssize_t endpos,
6229 const char *reason)
6230{
6231 if (*exceptionObject == NULL) {
6232 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006233 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006234 encoding, unicode, startpos, endpos, reason);
6235 }
6236 else {
6237 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6238 goto onError;
6239 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6240 goto onError;
6241 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6242 goto onError;
6243 return;
6244 onError:
6245 Py_DECREF(*exceptionObject);
6246 *exceptionObject = NULL;
6247 }
6248}
6249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006251static void
6252raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006253 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006254 PyObject *unicode,
6255 Py_ssize_t startpos, Py_ssize_t endpos,
6256 const char *reason)
6257{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006258 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006259 encoding, unicode, startpos, endpos, reason);
6260 if (*exceptionObject != NULL)
6261 PyCodec_StrictErrors(*exceptionObject);
6262}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263
6264/* error handling callback helper:
6265 build arguments, call the callback and check the arguments,
6266 put the result into newpos and return the replacement string, which
6267 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006268static PyObject *
6269unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006270 PyObject **errorHandler,
6271 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006272 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006273 Py_ssize_t startpos, Py_ssize_t endpos,
6274 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006275{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006276 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006277 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 PyObject *restuple;
6279 PyObject *resunicode;
6280
6281 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 }
6286
Benjamin Petersonbac79492012-01-14 13:34:47 -05006287 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006288 return NULL;
6289 len = PyUnicode_GET_LENGTH(unicode);
6290
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006291 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006292 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295
6296 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006301 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 Py_DECREF(restuple);
6303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006305 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 &resunicode, newpos)) {
6307 Py_DECREF(restuple);
6308 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006309 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006310 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6311 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6312 Py_DECREF(restuple);
6313 return NULL;
6314 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006316 *newpos = len + *newpos;
6317 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6319 Py_DECREF(restuple);
6320 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322 Py_INCREF(resunicode);
6323 Py_DECREF(restuple);
6324 return resunicode;
6325}
6326
Alexander Belopolsky40018472011-02-26 01:02:56 +00006327static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006329 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006330 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006331{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006332 /* input state */
6333 Py_ssize_t pos=0, size;
6334 int kind;
6335 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 /* output object */
6337 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 /* pointer into the output */
6339 char *str;
6340 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006341 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006342 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6343 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 PyObject *errorHandler = NULL;
6345 PyObject *exc = NULL;
6346 /* the following variable is used for caching string comparisons
6347 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6348 int known_errorHandler = -1;
6349
Benjamin Petersonbac79492012-01-14 13:34:47 -05006350 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 return NULL;
6352 size = PyUnicode_GET_LENGTH(unicode);
6353 kind = PyUnicode_KIND(unicode);
6354 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 /* allocate enough for a simple encoding without
6356 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006357 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006358 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006359 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006361 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006362 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 ressize = size;
6364
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006365 while (pos < size) {
6366 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 /* can we encode this? */
6369 if (c<limit) {
6370 /* no overflow check, because we know that the space is enough */
6371 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 Py_ssize_t requiredsize;
6376 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 Py_ssize_t collstart = pos;
6380 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006382 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 ++collend;
6384 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6385 if (known_errorHandler==-1) {
6386 if ((errors==NULL) || (!strcmp(errors, "strict")))
6387 known_errorHandler = 1;
6388 else if (!strcmp(errors, "replace"))
6389 known_errorHandler = 2;
6390 else if (!strcmp(errors, "ignore"))
6391 known_errorHandler = 3;
6392 else if (!strcmp(errors, "xmlcharrefreplace"))
6393 known_errorHandler = 4;
6394 else
6395 known_errorHandler = 0;
6396 }
6397 switch (known_errorHandler) {
6398 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006399 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 goto onError;
6401 case 2: /* replace */
6402 while (collstart++<collend)
6403 *str++ = '?'; /* fall through */
6404 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 break;
6407 case 4: /* xmlcharrefreplace */
6408 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 /* determine replacement size */
6410 for (i = collstart, repsize = 0; i < collend; ++i) {
6411 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6412 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006422 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006424 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006425 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006427 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006429 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 if (requiredsize > ressize) {
6431 if (requiredsize<2*ressize)
6432 requiredsize = 2*ressize;
6433 if (_PyBytes_Resize(&res, requiredsize))
6434 goto onError;
6435 str = PyBytes_AS_STRING(res) + respos;
6436 ressize = requiredsize;
6437 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006438 /* generate replacement */
6439 for (i = collstart; i < collend; ++i) {
6440 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 break;
6444 default:
6445 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006446 encoding, reason, unicode, &exc,
6447 collstart, collend, &newpos);
6448 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006449 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006451 if (PyBytes_Check(repunicode)) {
6452 /* Directly copy bytes result to output. */
6453 repsize = PyBytes_Size(repunicode);
6454 if (repsize > 1) {
6455 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006456 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006457 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6458 Py_DECREF(repunicode);
6459 goto onError;
6460 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006461 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006462 ressize += repsize-1;
6463 }
6464 memcpy(str, PyBytes_AsString(repunicode), repsize);
6465 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006466 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006467 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006468 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006469 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 /* need more space? (at least enough for what we
6471 have+the replacement+the rest of the string, so
6472 we won't have to check space for encodable characters) */
6473 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 repsize = PyUnicode_GET_LENGTH(repunicode);
6475 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 if (requiredsize > ressize) {
6477 if (requiredsize<2*ressize)
6478 requiredsize = 2*ressize;
6479 if (_PyBytes_Resize(&res, requiredsize)) {
6480 Py_DECREF(repunicode);
6481 goto onError;
6482 }
6483 str = PyBytes_AS_STRING(res) + respos;
6484 ressize = requiredsize;
6485 }
6486 /* check if there is anything unencodable in the replacement
6487 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 for (i = 0; repsize-->0; ++i, ++str) {
6489 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006491 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006492 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 Py_DECREF(repunicode);
6494 goto onError;
6495 }
6496 *str = (char)c;
6497 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006499 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006500 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501 }
6502 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006503 /* Resize if we allocated to much */
6504 size = str - PyBytes_AS_STRING(res);
6505 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006506 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006507 if (_PyBytes_Resize(&res, size) < 0)
6508 goto onError;
6509 }
6510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 Py_XDECREF(errorHandler);
6512 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006513 return res;
6514
6515 onError:
6516 Py_XDECREF(res);
6517 Py_XDECREF(errorHandler);
6518 Py_XDECREF(exc);
6519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006520}
6521
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006522/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006523PyObject *
6524PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006525 Py_ssize_t size,
6526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 PyObject *result;
6529 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6530 if (unicode == NULL)
6531 return NULL;
6532 result = unicode_encode_ucs1(unicode, errors, 256);
6533 Py_DECREF(unicode);
6534 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535}
6536
Alexander Belopolsky40018472011-02-26 01:02:56 +00006537PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006538_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
6540 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 PyErr_BadArgument();
6542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006544 if (PyUnicode_READY(unicode) == -1)
6545 return NULL;
6546 /* Fast path: if it is a one-byte string, construct
6547 bytes object directly. */
6548 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6549 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6550 PyUnicode_GET_LENGTH(unicode));
6551 /* Non-Latin-1 characters present. Defer to above function to
6552 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006554}
6555
6556PyObject*
6557PyUnicode_AsLatin1String(PyObject *unicode)
6558{
6559 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560}
6561
6562/* --- 7-bit ASCII Codec -------------------------------------------------- */
6563
Alexander Belopolsky40018472011-02-26 01:02:56 +00006564PyObject *
6565PyUnicode_DecodeASCII(const char *s,
6566 Py_ssize_t size,
6567 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006570 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006571 int kind;
6572 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006573 Py_ssize_t startinpos;
6574 Py_ssize_t endinpos;
6575 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 const char *e;
6577 PyObject *errorHandler = NULL;
6578 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006579
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006580 if (size == 0) {
6581 Py_INCREF(unicode_empty);
6582 return unicode_empty;
6583 }
6584
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006586 if (size == 1 && (unsigned char)s[0] < 128)
6587 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006588
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006589 unicode = PyUnicode_New(size, 127);
6590 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006594 data = PyUnicode_1BYTE_DATA(unicode);
6595 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6596 if (outpos == size)
6597 return unicode;
6598
6599 s += outpos;
6600 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006601 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 register unsigned char c = (unsigned char)*s;
6603 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006604 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 ++s;
6606 }
6607 else {
6608 startinpos = s-starts;
6609 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 if (unicode_decode_call_errorhandler(
6611 errors, &errorHandler,
6612 "ascii", "ordinal not in range(128)",
6613 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006614 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006616 kind = PyUnicode_KIND(unicode);
6617 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006620 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 Py_XDECREF(errorHandler);
6623 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006624 assert(_PyUnicode_CheckConsistency(unicode, 1));
6625 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006626
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006628 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 Py_XDECREF(errorHandler);
6630 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 return NULL;
6632}
6633
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006634/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006635PyObject *
6636PyUnicode_EncodeASCII(const Py_UNICODE *p,
6637 Py_ssize_t size,
6638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006640 PyObject *result;
6641 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6642 if (unicode == NULL)
6643 return NULL;
6644 result = unicode_encode_ucs1(unicode, errors, 128);
6645 Py_DECREF(unicode);
6646 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Alexander Belopolsky40018472011-02-26 01:02:56 +00006649PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006650_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
6652 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 PyErr_BadArgument();
6654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656 if (PyUnicode_READY(unicode) == -1)
6657 return NULL;
6658 /* Fast path: if it is an ASCII-only string, construct bytes object
6659 directly. Else defer to above function to raise the exception. */
6660 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6661 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6662 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664}
6665
6666PyObject *
6667PyUnicode_AsASCIIString(PyObject *unicode)
6668{
6669 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670}
6671
Victor Stinner99b95382011-07-04 14:23:54 +02006672#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006673
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006674/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006675
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006676#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677#define NEED_RETRY
6678#endif
6679
Victor Stinner3a50e702011-10-18 21:21:00 +02006680#ifndef WC_ERR_INVALID_CHARS
6681# define WC_ERR_INVALID_CHARS 0x0080
6682#endif
6683
6684static char*
6685code_page_name(UINT code_page, PyObject **obj)
6686{
6687 *obj = NULL;
6688 if (code_page == CP_ACP)
6689 return "mbcs";
6690 if (code_page == CP_UTF7)
6691 return "CP_UTF7";
6692 if (code_page == CP_UTF8)
6693 return "CP_UTF8";
6694
6695 *obj = PyBytes_FromFormat("cp%u", code_page);
6696 if (*obj == NULL)
6697 return NULL;
6698 return PyBytes_AS_STRING(*obj);
6699}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700
Alexander Belopolsky40018472011-02-26 01:02:56 +00006701static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006702is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703{
6704 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006705 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706
Victor Stinner3a50e702011-10-18 21:21:00 +02006707 if (!IsDBCSLeadByteEx(code_page, *curr))
6708 return 0;
6709
6710 prev = CharPrevExA(code_page, s, curr, 0);
6711 if (prev == curr)
6712 return 1;
6713 /* FIXME: This code is limited to "true" double-byte encodings,
6714 as it assumes an incomplete character consists of a single
6715 byte. */
6716 if (curr - prev == 2)
6717 return 1;
6718 if (!IsDBCSLeadByteEx(code_page, *prev))
6719 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720 return 0;
6721}
6722
Victor Stinner3a50e702011-10-18 21:21:00 +02006723static DWORD
6724decode_code_page_flags(UINT code_page)
6725{
6726 if (code_page == CP_UTF7) {
6727 /* The CP_UTF7 decoder only supports flags=0 */
6728 return 0;
6729 }
6730 else
6731 return MB_ERR_INVALID_CHARS;
6732}
6733
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006734/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006735 * Decode a byte string from a Windows code page into unicode object in strict
6736 * mode.
6737 *
6738 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6739 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006740 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006741static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006742decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006743 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006744 const char *in,
6745 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006746{
Victor Stinner3a50e702011-10-18 21:21:00 +02006747 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006748 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006749 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750
6751 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006752 assert(insize > 0);
6753 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6754 if (outsize <= 0)
6755 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756
6757 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006759 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006760 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 if (*v == NULL)
6762 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006763 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006764 }
6765 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006767 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006768 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006770 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771 }
6772
6773 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006774 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6775 if (outsize <= 0)
6776 goto error;
6777 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006778
Victor Stinner3a50e702011-10-18 21:21:00 +02006779error:
6780 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6781 return -2;
6782 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006783 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784}
6785
Victor Stinner3a50e702011-10-18 21:21:00 +02006786/*
6787 * Decode a byte string from a code page into unicode object with an error
6788 * handler.
6789 *
6790 * Returns consumed size if succeed, or raise a WindowsError or
6791 * UnicodeDecodeError exception and returns -1 on error.
6792 */
6793static int
6794decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006795 PyObject **v,
6796 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006797 const char *errors)
6798{
6799 const char *startin = in;
6800 const char *endin = in + size;
6801 const DWORD flags = decode_code_page_flags(code_page);
6802 /* Ideally, we should get reason from FormatMessage. This is the Windows
6803 2000 English version of the message. */
6804 const char *reason = "No mapping for the Unicode character exists "
6805 "in the target code page.";
6806 /* each step cannot decode more than 1 character, but a character can be
6807 represented as a surrogate pair */
6808 wchar_t buffer[2], *startout, *out;
6809 int insize, outsize;
6810 PyObject *errorHandler = NULL;
6811 PyObject *exc = NULL;
6812 PyObject *encoding_obj = NULL;
6813 char *encoding;
6814 DWORD err;
6815 int ret = -1;
6816
6817 assert(size > 0);
6818
6819 encoding = code_page_name(code_page, &encoding_obj);
6820 if (encoding == NULL)
6821 return -1;
6822
6823 if (errors == NULL || strcmp(errors, "strict") == 0) {
6824 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6825 UnicodeDecodeError. */
6826 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6827 if (exc != NULL) {
6828 PyCodec_StrictErrors(exc);
6829 Py_CLEAR(exc);
6830 }
6831 goto error;
6832 }
6833
6834 if (*v == NULL) {
6835 /* Create unicode object */
6836 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6837 PyErr_NoMemory();
6838 goto error;
6839 }
Victor Stinnerab595942011-12-17 04:59:06 +01006840 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006841 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006842 if (*v == NULL)
6843 goto error;
6844 startout = PyUnicode_AS_UNICODE(*v);
6845 }
6846 else {
6847 /* Extend unicode object */
6848 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6849 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6850 PyErr_NoMemory();
6851 goto error;
6852 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006853 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 goto error;
6855 startout = PyUnicode_AS_UNICODE(*v) + n;
6856 }
6857
6858 /* Decode the byte string character per character */
6859 out = startout;
6860 while (in < endin)
6861 {
6862 /* Decode a character */
6863 insize = 1;
6864 do
6865 {
6866 outsize = MultiByteToWideChar(code_page, flags,
6867 in, insize,
6868 buffer, Py_ARRAY_LENGTH(buffer));
6869 if (outsize > 0)
6870 break;
6871 err = GetLastError();
6872 if (err != ERROR_NO_UNICODE_TRANSLATION
6873 && err != ERROR_INSUFFICIENT_BUFFER)
6874 {
6875 PyErr_SetFromWindowsErr(0);
6876 goto error;
6877 }
6878 insize++;
6879 }
6880 /* 4=maximum length of a UTF-8 sequence */
6881 while (insize <= 4 && (in + insize) <= endin);
6882
6883 if (outsize <= 0) {
6884 Py_ssize_t startinpos, endinpos, outpos;
6885
6886 startinpos = in - startin;
6887 endinpos = startinpos + 1;
6888 outpos = out - PyUnicode_AS_UNICODE(*v);
6889 if (unicode_decode_call_errorhandler(
6890 errors, &errorHandler,
6891 encoding, reason,
6892 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006893 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 {
6895 goto error;
6896 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006897 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006898 }
6899 else {
6900 in += insize;
6901 memcpy(out, buffer, outsize * sizeof(wchar_t));
6902 out += outsize;
6903 }
6904 }
6905
6906 /* write a NUL character at the end */
6907 *out = 0;
6908
6909 /* Extend unicode object */
6910 outsize = out - startout;
6911 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006912 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006915
6916error:
6917 Py_XDECREF(encoding_obj);
6918 Py_XDECREF(errorHandler);
6919 Py_XDECREF(exc);
6920 return ret;
6921}
6922
Victor Stinner3a50e702011-10-18 21:21:00 +02006923static PyObject *
6924decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006925 const char *s, Py_ssize_t size,
6926 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927{
Victor Stinner76a31a62011-11-04 00:05:13 +01006928 PyObject *v = NULL;
6929 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930
Victor Stinner3a50e702011-10-18 21:21:00 +02006931 if (code_page < 0) {
6932 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6933 return NULL;
6934 }
6935
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006936 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 do
6940 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006942 if (size > INT_MAX) {
6943 chunk_size = INT_MAX;
6944 final = 0;
6945 done = 0;
6946 }
6947 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006949 {
6950 chunk_size = (int)size;
6951 final = (consumed == NULL);
6952 done = 1;
6953 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954
Victor Stinner76a31a62011-11-04 00:05:13 +01006955 /* Skip trailing lead-byte unless 'final' is set */
6956 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6957 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958
Victor Stinner76a31a62011-11-04 00:05:13 +01006959 if (chunk_size == 0 && done) {
6960 if (v != NULL)
6961 break;
6962 Py_INCREF(unicode_empty);
6963 return unicode_empty;
6964 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006965
Victor Stinner76a31a62011-11-04 00:05:13 +01006966
6967 converted = decode_code_page_strict(code_page, &v,
6968 s, chunk_size);
6969 if (converted == -2)
6970 converted = decode_code_page_errors(code_page, &v,
6971 s, chunk_size,
6972 errors);
6973 assert(converted != 0);
6974
6975 if (converted < 0) {
6976 Py_XDECREF(v);
6977 return NULL;
6978 }
6979
6980 if (consumed)
6981 *consumed += converted;
6982
6983 s += converted;
6984 size -= converted;
6985 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006986
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006987 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006988}
6989
Alexander Belopolsky40018472011-02-26 01:02:56 +00006990PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006991PyUnicode_DecodeCodePageStateful(int code_page,
6992 const char *s,
6993 Py_ssize_t size,
6994 const char *errors,
6995 Py_ssize_t *consumed)
6996{
6997 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6998}
6999
7000PyObject *
7001PyUnicode_DecodeMBCSStateful(const char *s,
7002 Py_ssize_t size,
7003 const char *errors,
7004 Py_ssize_t *consumed)
7005{
7006 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7007}
7008
7009PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007010PyUnicode_DecodeMBCS(const char *s,
7011 Py_ssize_t size,
7012 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007013{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7015}
7016
Victor Stinner3a50e702011-10-18 21:21:00 +02007017static DWORD
7018encode_code_page_flags(UINT code_page, const char *errors)
7019{
7020 if (code_page == CP_UTF8) {
7021 if (winver.dwMajorVersion >= 6)
7022 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7023 and later */
7024 return WC_ERR_INVALID_CHARS;
7025 else
7026 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7027 return 0;
7028 }
7029 else if (code_page == CP_UTF7) {
7030 /* CP_UTF7 only supports flags=0 */
7031 return 0;
7032 }
7033 else {
7034 if (errors != NULL && strcmp(errors, "replace") == 0)
7035 return 0;
7036 else
7037 return WC_NO_BEST_FIT_CHARS;
7038 }
7039}
7040
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007041/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 * Encode a Unicode string to a Windows code page into a byte string in strict
7043 * mode.
7044 *
7045 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7046 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007048static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007049encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007050 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052{
Victor Stinner554f3f02010-06-16 23:33:54 +00007053 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 BOOL *pusedDefaultChar = &usedDefaultChar;
7055 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007056 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007057 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007058 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007059 const DWORD flags = encode_code_page_flags(code_page, NULL);
7060 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007061 /* Create a substring so that we can get the UTF-16 representation
7062 of just the slice under consideration. */
7063 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064
Martin v. Löwis3d325192011-11-04 18:23:06 +01007065 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007066
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007068 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007070 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007071
Victor Stinner2fc507f2011-11-04 20:06:39 +01007072 substring = PyUnicode_Substring(unicode, offset, offset+len);
7073 if (substring == NULL)
7074 return -1;
7075 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7076 if (p == NULL) {
7077 Py_DECREF(substring);
7078 return -1;
7079 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007080
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007081 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 outsize = WideCharToMultiByte(code_page, flags,
7083 p, size,
7084 NULL, 0,
7085 NULL, pusedDefaultChar);
7086 if (outsize <= 0)
7087 goto error;
7088 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007089 if (pusedDefaultChar && *pusedDefaultChar) {
7090 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007092 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007093
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007097 if (*outbytes == NULL) {
7098 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007100 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 }
7103 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 const Py_ssize_t n = PyBytes_Size(*outbytes);
7106 if (outsize > PY_SSIZE_T_MAX - n) {
7107 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007108 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007111 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7112 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007114 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116 }
7117
7118 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 outsize = WideCharToMultiByte(code_page, flags,
7120 p, size,
7121 out, outsize,
7122 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007123 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 if (outsize <= 0)
7125 goto error;
7126 if (pusedDefaultChar && *pusedDefaultChar)
7127 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007129
Victor Stinner3a50e702011-10-18 21:21:00 +02007130error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007131 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007132 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7133 return -2;
7134 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007135 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007136}
7137
Victor Stinner3a50e702011-10-18 21:21:00 +02007138/*
7139 * Encode a Unicode string to a Windows code page into a byte string using a
7140 * error handler.
7141 *
7142 * Returns consumed characters if succeed, or raise a WindowsError and returns
7143 * -1 on other error.
7144 */
7145static int
7146encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007147 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007148 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007149{
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007151 Py_ssize_t pos = unicode_offset;
7152 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 /* Ideally, we should get reason from FormatMessage. This is the Windows
7154 2000 English version of the message. */
7155 const char *reason = "invalid character";
7156 /* 4=maximum length of a UTF-8 sequence */
7157 char buffer[4];
7158 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7159 Py_ssize_t outsize;
7160 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 PyObject *errorHandler = NULL;
7162 PyObject *exc = NULL;
7163 PyObject *encoding_obj = NULL;
7164 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007165 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 PyObject *rep;
7167 int ret = -1;
7168
7169 assert(insize > 0);
7170
7171 encoding = code_page_name(code_page, &encoding_obj);
7172 if (encoding == NULL)
7173 return -1;
7174
7175 if (errors == NULL || strcmp(errors, "strict") == 0) {
7176 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7177 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007178 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 if (exc != NULL) {
7180 PyCodec_StrictErrors(exc);
7181 Py_DECREF(exc);
7182 }
7183 Py_XDECREF(encoding_obj);
7184 return -1;
7185 }
7186
7187 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7188 pusedDefaultChar = &usedDefaultChar;
7189 else
7190 pusedDefaultChar = NULL;
7191
7192 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7193 PyErr_NoMemory();
7194 goto error;
7195 }
7196 outsize = insize * Py_ARRAY_LENGTH(buffer);
7197
7198 if (*outbytes == NULL) {
7199 /* Create string object */
7200 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7201 if (*outbytes == NULL)
7202 goto error;
7203 out = PyBytes_AS_STRING(*outbytes);
7204 }
7205 else {
7206 /* Extend string object */
7207 Py_ssize_t n = PyBytes_Size(*outbytes);
7208 if (n > PY_SSIZE_T_MAX - outsize) {
7209 PyErr_NoMemory();
7210 goto error;
7211 }
7212 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7213 goto error;
7214 out = PyBytes_AS_STRING(*outbytes) + n;
7215 }
7216
7217 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7221 wchar_t chars[2];
7222 int charsize;
7223 if (ch < 0x10000) {
7224 chars[0] = (wchar_t)ch;
7225 charsize = 1;
7226 }
7227 else {
7228 ch -= 0x10000;
7229 chars[0] = 0xd800 + (ch >> 10);
7230 chars[1] = 0xdc00 + (ch & 0x3ff);
7231 charsize = 2;
7232 }
7233
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007235 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 buffer, Py_ARRAY_LENGTH(buffer),
7237 NULL, pusedDefaultChar);
7238 if (outsize > 0) {
7239 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7240 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007241 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 memcpy(out, buffer, outsize);
7243 out += outsize;
7244 continue;
7245 }
7246 }
7247 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7248 PyErr_SetFromWindowsErr(0);
7249 goto error;
7250 }
7251
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 rep = unicode_encode_call_errorhandler(
7253 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007254 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007255 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 if (rep == NULL)
7257 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007258 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007259
7260 if (PyBytes_Check(rep)) {
7261 outsize = PyBytes_GET_SIZE(rep);
7262 if (outsize != 1) {
7263 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7264 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7265 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7266 Py_DECREF(rep);
7267 goto error;
7268 }
7269 out = PyBytes_AS_STRING(*outbytes) + offset;
7270 }
7271 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7272 out += outsize;
7273 }
7274 else {
7275 Py_ssize_t i;
7276 enum PyUnicode_Kind kind;
7277 void *data;
7278
Benjamin Petersonbac79492012-01-14 13:34:47 -05007279 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 Py_DECREF(rep);
7281 goto error;
7282 }
7283
7284 outsize = PyUnicode_GET_LENGTH(rep);
7285 if (outsize != 1) {
7286 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7287 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7288 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7289 Py_DECREF(rep);
7290 goto error;
7291 }
7292 out = PyBytes_AS_STRING(*outbytes) + offset;
7293 }
7294 kind = PyUnicode_KIND(rep);
7295 data = PyUnicode_DATA(rep);
7296 for (i=0; i < outsize; i++) {
7297 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7298 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007299 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007300 encoding, unicode,
7301 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 "unable to encode error handler result to ASCII");
7303 Py_DECREF(rep);
7304 goto error;
7305 }
7306 *out = (unsigned char)ch;
7307 out++;
7308 }
7309 }
7310 Py_DECREF(rep);
7311 }
7312 /* write a NUL byte */
7313 *out = 0;
7314 outsize = out - PyBytes_AS_STRING(*outbytes);
7315 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7316 if (_PyBytes_Resize(outbytes, outsize) < 0)
7317 goto error;
7318 ret = 0;
7319
7320error:
7321 Py_XDECREF(encoding_obj);
7322 Py_XDECREF(errorHandler);
7323 Py_XDECREF(exc);
7324 return ret;
7325}
7326
Victor Stinner3a50e702011-10-18 21:21:00 +02007327static PyObject *
7328encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007329 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 const char *errors)
7331{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007334 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007336
Benjamin Petersonbac79492012-01-14 13:34:47 -05007337 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007338 return NULL;
7339 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (code_page < 0) {
7342 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7343 return NULL;
7344 }
7345
Martin v. Löwis3d325192011-11-04 18:23:06 +01007346 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 return PyBytes_FromStringAndSize(NULL, 0);
7348
Victor Stinner7581cef2011-11-03 22:32:33 +01007349 offset = 0;
7350 do
7351 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007353 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 chunks. */
7355 if (len > INT_MAX/2) {
7356 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007357 done = 0;
7358 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007359 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007361 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007362 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007363 done = 1;
7364 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365
Victor Stinner76a31a62011-11-04 00:05:13 +01007366 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007367 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007368 errors);
7369 if (ret == -2)
7370 ret = encode_code_page_errors(code_page, &outbytes,
7371 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007372 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007373 if (ret < 0) {
7374 Py_XDECREF(outbytes);
7375 return NULL;
7376 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377
Victor Stinner7581cef2011-11-03 22:32:33 +01007378 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007379 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007380 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 return outbytes;
7383}
7384
7385PyObject *
7386PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7387 Py_ssize_t size,
7388 const char *errors)
7389{
Victor Stinner7581cef2011-11-03 22:32:33 +01007390 PyObject *unicode, *res;
7391 unicode = PyUnicode_FromUnicode(p, size);
7392 if (unicode == NULL)
7393 return NULL;
7394 res = encode_code_page(CP_ACP, unicode, errors);
7395 Py_DECREF(unicode);
7396 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397}
7398
7399PyObject *
7400PyUnicode_EncodeCodePage(int code_page,
7401 PyObject *unicode,
7402 const char *errors)
7403{
Victor Stinner7581cef2011-11-03 22:32:33 +01007404 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007405}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007406
Alexander Belopolsky40018472011-02-26 01:02:56 +00007407PyObject *
7408PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007409{
7410 if (!PyUnicode_Check(unicode)) {
7411 PyErr_BadArgument();
7412 return NULL;
7413 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007414 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007415}
7416
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417#undef NEED_RETRY
7418
Victor Stinner99b95382011-07-04 14:23:54 +02007419#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007420
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421/* --- Character Mapping Codec -------------------------------------------- */
7422
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423PyObject *
7424PyUnicode_DecodeCharmap(const char *s,
7425 Py_ssize_t size,
7426 PyObject *mapping,
7427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007429 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007430 Py_ssize_t startinpos;
7431 Py_ssize_t endinpos;
7432 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007433 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007434 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007435 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007436 PyObject *errorHandler = NULL;
7437 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 /* Default to Latin-1 */
7440 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007443 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007447 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007448 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007450 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007451 Py_ssize_t maplen;
7452 enum PyUnicode_Kind kind;
7453 void *data;
7454 Py_UCS4 x;
7455
Benjamin Petersonbac79492012-01-14 13:34:47 -05007456 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007457 return NULL;
7458
7459 maplen = PyUnicode_GET_LENGTH(mapping);
7460 data = PyUnicode_DATA(mapping);
7461 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 while (s < e) {
7463 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007466 x = PyUnicode_READ(kind, data, ch);
7467 else
7468 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007470 if (x == 0xfffe)
7471 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 startinpos = s-starts;
7474 endinpos = startinpos+1;
7475 if (unicode_decode_call_errorhandler(
7476 errors, &errorHandler,
7477 "charmap", "character maps to <undefined>",
7478 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007479 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 goto onError;
7481 }
7482 continue;
7483 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007484
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007485 if (unicode_putchar(&v, &outpos, x) < 0)
7486 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007488 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007489 }
7490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 while (s < e) {
7492 unsigned char ch = *s;
7493 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007494
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7496 w = PyLong_FromLong((long)ch);
7497 if (w == NULL)
7498 goto onError;
7499 x = PyObject_GetItem(mapping, w);
7500 Py_DECREF(w);
7501 if (x == NULL) {
7502 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7503 /* No mapping found means: mapping is undefined. */
7504 PyErr_Clear();
7505 x = Py_None;
7506 Py_INCREF(x);
7507 } else
7508 goto onError;
7509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007510
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 /* Apply mapping */
7512 if (PyLong_Check(x)) {
7513 long value = PyLong_AS_LONG(x);
7514 if (value < 0 || value > 65535) {
7515 PyErr_SetString(PyExc_TypeError,
7516 "character mapping must be in range(65536)");
7517 Py_DECREF(x);
7518 goto onError;
7519 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007520 if (unicode_putchar(&v, &outpos, value) < 0)
7521 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 }
7523 else if (x == Py_None) {
7524 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 startinpos = s-starts;
7526 endinpos = startinpos+1;
7527 if (unicode_decode_call_errorhandler(
7528 errors, &errorHandler,
7529 "charmap", "character maps to <undefined>",
7530 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007531 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 Py_DECREF(x);
7533 goto onError;
7534 }
7535 Py_DECREF(x);
7536 continue;
7537 }
7538 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007539 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007540
Benjamin Petersonbac79492012-01-14 13:34:47 -05007541 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007542 goto onError;
7543 targetsize = PyUnicode_GET_LENGTH(x);
7544
7545 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007547 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007548 PyUnicode_READ_CHAR(x, 0)) < 0)
7549 goto onError;
7550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 else if (targetsize > 1) {
7552 /* 1-n mapping */
7553 if (targetsize > extrachars) {
7554 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 Py_ssize_t needed = (targetsize - extrachars) + \
7556 (targetsize << 2);
7557 extrachars += needed;
7558 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007559 if (unicode_resize(&v,
7560 PyUnicode_GET_LENGTH(v) + needed) < 0)
7561 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 Py_DECREF(x);
7563 goto onError;
7564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007566 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007567 goto onError;
7568 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7569 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 extrachars -= targetsize;
7571 }
7572 /* 1-0 mapping: skip the character */
7573 }
7574 else {
7575 /* wrong return value */
7576 PyErr_SetString(PyExc_TypeError,
7577 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007578 Py_DECREF(x);
7579 goto onError;
7580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 Py_DECREF(x);
7582 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007585 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007586 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 Py_XDECREF(errorHandler);
7588 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007589 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007590
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 Py_XDECREF(errorHandler);
7593 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 Py_XDECREF(v);
7595 return NULL;
7596}
7597
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007598/* Charmap encoding: the lookup table */
7599
Alexander Belopolsky40018472011-02-26 01:02:56 +00007600struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 PyObject_HEAD
7602 unsigned char level1[32];
7603 int count2, count3;
7604 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007605};
7606
7607static PyObject*
7608encoding_map_size(PyObject *obj, PyObject* args)
7609{
7610 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007613}
7614
7615static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 PyDoc_STR("Return the size (in bytes) of this object") },
7618 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619};
7620
7621static void
7622encoding_map_dealloc(PyObject* o)
7623{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007624 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625}
7626
7627static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 "EncodingMap", /*tp_name*/
7630 sizeof(struct encoding_map), /*tp_basicsize*/
7631 0, /*tp_itemsize*/
7632 /* methods */
7633 encoding_map_dealloc, /*tp_dealloc*/
7634 0, /*tp_print*/
7635 0, /*tp_getattr*/
7636 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007637 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 0, /*tp_repr*/
7639 0, /*tp_as_number*/
7640 0, /*tp_as_sequence*/
7641 0, /*tp_as_mapping*/
7642 0, /*tp_hash*/
7643 0, /*tp_call*/
7644 0, /*tp_str*/
7645 0, /*tp_getattro*/
7646 0, /*tp_setattro*/
7647 0, /*tp_as_buffer*/
7648 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7649 0, /*tp_doc*/
7650 0, /*tp_traverse*/
7651 0, /*tp_clear*/
7652 0, /*tp_richcompare*/
7653 0, /*tp_weaklistoffset*/
7654 0, /*tp_iter*/
7655 0, /*tp_iternext*/
7656 encoding_map_methods, /*tp_methods*/
7657 0, /*tp_members*/
7658 0, /*tp_getset*/
7659 0, /*tp_base*/
7660 0, /*tp_dict*/
7661 0, /*tp_descr_get*/
7662 0, /*tp_descr_set*/
7663 0, /*tp_dictoffset*/
7664 0, /*tp_init*/
7665 0, /*tp_alloc*/
7666 0, /*tp_new*/
7667 0, /*tp_free*/
7668 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007669};
7670
7671PyObject*
7672PyUnicode_BuildEncodingMap(PyObject* string)
7673{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007674 PyObject *result;
7675 struct encoding_map *mresult;
7676 int i;
7677 int need_dict = 0;
7678 unsigned char level1[32];
7679 unsigned char level2[512];
7680 unsigned char *mlevel1, *mlevel2, *mlevel3;
7681 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007682 int kind;
7683 void *data;
7684 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007686 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007687 PyErr_BadArgument();
7688 return NULL;
7689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007690 kind = PyUnicode_KIND(string);
7691 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007692 memset(level1, 0xFF, sizeof level1);
7693 memset(level2, 0xFF, sizeof level2);
7694
7695 /* If there isn't a one-to-one mapping of NULL to \0,
7696 or if there are non-BMP characters, we need to use
7697 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007698 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007699 need_dict = 1;
7700 for (i = 1; i < 256; i++) {
7701 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007702 ch = PyUnicode_READ(kind, data, i);
7703 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 need_dict = 1;
7705 break;
7706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007708 /* unmapped character */
7709 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007710 l1 = ch >> 11;
7711 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007712 if (level1[l1] == 0xFF)
7713 level1[l1] = count2++;
7714 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007715 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716 }
7717
7718 if (count2 >= 0xFF || count3 >= 0xFF)
7719 need_dict = 1;
7720
7721 if (need_dict) {
7722 PyObject *result = PyDict_New();
7723 PyObject *key, *value;
7724 if (!result)
7725 return NULL;
7726 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007727 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007728 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007729 if (!key || !value)
7730 goto failed1;
7731 if (PyDict_SetItem(result, key, value) == -1)
7732 goto failed1;
7733 Py_DECREF(key);
7734 Py_DECREF(value);
7735 }
7736 return result;
7737 failed1:
7738 Py_XDECREF(key);
7739 Py_XDECREF(value);
7740 Py_DECREF(result);
7741 return NULL;
7742 }
7743
7744 /* Create a three-level trie */
7745 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7746 16*count2 + 128*count3 - 1);
7747 if (!result)
7748 return PyErr_NoMemory();
7749 PyObject_Init(result, &EncodingMapType);
7750 mresult = (struct encoding_map*)result;
7751 mresult->count2 = count2;
7752 mresult->count3 = count3;
7753 mlevel1 = mresult->level1;
7754 mlevel2 = mresult->level23;
7755 mlevel3 = mresult->level23 + 16*count2;
7756 memcpy(mlevel1, level1, 32);
7757 memset(mlevel2, 0xFF, 16*count2);
7758 memset(mlevel3, 0, 128*count3);
7759 count3 = 0;
7760 for (i = 1; i < 256; i++) {
7761 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007762 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763 /* unmapped character */
7764 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007765 o1 = PyUnicode_READ(kind, data, i)>>11;
7766 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 i2 = 16*mlevel1[o1] + o2;
7768 if (mlevel2[i2] == 0xFF)
7769 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007770 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771 i3 = 128*mlevel2[i2] + o3;
7772 mlevel3[i3] = i;
7773 }
7774 return result;
7775}
7776
7777static int
Victor Stinner22168992011-11-20 17:09:18 +01007778encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007779{
7780 struct encoding_map *map = (struct encoding_map*)mapping;
7781 int l1 = c>>11;
7782 int l2 = (c>>7) & 0xF;
7783 int l3 = c & 0x7F;
7784 int i;
7785
Victor Stinner22168992011-11-20 17:09:18 +01007786 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788 if (c == 0)
7789 return 0;
7790 /* level 1*/
7791 i = map->level1[l1];
7792 if (i == 0xFF) {
7793 return -1;
7794 }
7795 /* level 2*/
7796 i = map->level23[16*i+l2];
7797 if (i == 0xFF) {
7798 return -1;
7799 }
7800 /* level 3 */
7801 i = map->level23[16*map->count2 + 128*i + l3];
7802 if (i == 0) {
7803 return -1;
7804 }
7805 return i;
7806}
7807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808/* Lookup the character ch in the mapping. If the character
7809 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007810 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007811static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007812charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813{
Christian Heimes217cfd12007-12-02 14:31:20 +00007814 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 PyObject *x;
7816
7817 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819 x = PyObject_GetItem(mapping, w);
7820 Py_DECREF(w);
7821 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7823 /* No mapping found means: mapping is undefined. */
7824 PyErr_Clear();
7825 x = Py_None;
7826 Py_INCREF(x);
7827 return x;
7828 } else
7829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007831 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007833 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 long value = PyLong_AS_LONG(x);
7835 if (value < 0 || value > 255) {
7836 PyErr_SetString(PyExc_TypeError,
7837 "character mapping must be in range(256)");
7838 Py_DECREF(x);
7839 return NULL;
7840 }
7841 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007843 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 /* wrong return value */
7847 PyErr_Format(PyExc_TypeError,
7848 "character mapping must return integer, bytes or None, not %.400s",
7849 x->ob_type->tp_name);
7850 Py_DECREF(x);
7851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 }
7853}
7854
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007855static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007856charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7859 /* exponentially overallocate to minimize reallocations */
7860 if (requiredsize < 2*outsize)
7861 requiredsize = 2*outsize;
7862 if (_PyBytes_Resize(outobj, requiredsize))
7863 return -1;
7864 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865}
7866
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007871 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007872 space is available. Return a new reference to the object that
7873 was put in the output buffer, or Py_None, if the mapping was undefined
7874 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007875 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007877charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007878 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 PyObject *rep;
7881 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007882 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883
Christian Heimes90aa7642007-12-19 02:45:37 +00007884 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 if (res == -1)
7888 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 if (outsize<requiredsize)
7890 if (charmapencode_resize(outobj, outpos, requiredsize))
7891 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007892 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 outstart[(*outpos)++] = (char)res;
7894 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 }
7896
7897 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007898 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007900 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 Py_DECREF(rep);
7902 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 if (PyLong_Check(rep)) {
7905 Py_ssize_t requiredsize = *outpos+1;
7906 if (outsize<requiredsize)
7907 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7908 Py_DECREF(rep);
7909 return enc_EXCEPTION;
7910 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007911 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007913 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 else {
7915 const char *repchars = PyBytes_AS_STRING(rep);
7916 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7917 Py_ssize_t requiredsize = *outpos+repsize;
7918 if (outsize<requiredsize)
7919 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7920 Py_DECREF(rep);
7921 return enc_EXCEPTION;
7922 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007923 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 memcpy(outstart + *outpos, repchars, repsize);
7925 *outpos += repsize;
7926 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007927 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928 Py_DECREF(rep);
7929 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007930}
7931
7932/* handle an error in PyUnicode_EncodeCharmap
7933 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007934static int
7935charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007936 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007937 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007938 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007939 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940{
7941 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007942 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007943 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007944 enum PyUnicode_Kind kind;
7945 void *data;
7946 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007948 Py_ssize_t collstartpos = *inpos;
7949 Py_ssize_t collendpos = *inpos+1;
7950 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007951 char *encoding = "charmap";
7952 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007954 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007955 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956
Benjamin Petersonbac79492012-01-14 13:34:47 -05007957 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007958 return -1;
7959 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960 /* find all unencodable characters */
7961 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007962 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007963 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007965 val = encoding_map_lookup(ch, mapping);
7966 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 break;
7968 ++collendpos;
7969 continue;
7970 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007971
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007972 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7973 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 if (rep==NULL)
7975 return -1;
7976 else if (rep!=Py_None) {
7977 Py_DECREF(rep);
7978 break;
7979 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007980 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007982 }
7983 /* cache callback name lookup
7984 * (if not done yet, i.e. it's the first error) */
7985 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 if ((errors==NULL) || (!strcmp(errors, "strict")))
7987 *known_errorHandler = 1;
7988 else if (!strcmp(errors, "replace"))
7989 *known_errorHandler = 2;
7990 else if (!strcmp(errors, "ignore"))
7991 *known_errorHandler = 3;
7992 else if (!strcmp(errors, "xmlcharrefreplace"))
7993 *known_errorHandler = 4;
7994 else
7995 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007996 }
7997 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007998 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007999 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008000 return -1;
8001 case 2: /* replace */
8002 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 x = charmapencode_output('?', mapping, res, respos);
8004 if (x==enc_EXCEPTION) {
8005 return -1;
8006 }
8007 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008008 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 return -1;
8010 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008011 }
8012 /* fall through */
8013 case 3: /* ignore */
8014 *inpos = collendpos;
8015 break;
8016 case 4: /* xmlcharrefreplace */
8017 /* generate replacement (temporarily (mis)uses p) */
8018 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 char buffer[2+29+1+1];
8020 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008021 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 for (cp = buffer; *cp; ++cp) {
8023 x = charmapencode_output(*cp, mapping, res, respos);
8024 if (x==enc_EXCEPTION)
8025 return -1;
8026 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008027 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return -1;
8029 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008030 }
8031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008032 *inpos = collendpos;
8033 break;
8034 default:
8035 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008036 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008040 if (PyBytes_Check(repunicode)) {
8041 /* Directly copy bytes result to output. */
8042 Py_ssize_t outsize = PyBytes_Size(*res);
8043 Py_ssize_t requiredsize;
8044 repsize = PyBytes_Size(repunicode);
8045 requiredsize = *respos + repsize;
8046 if (requiredsize > outsize)
8047 /* Make room for all additional bytes. */
8048 if (charmapencode_resize(res, respos, requiredsize)) {
8049 Py_DECREF(repunicode);
8050 return -1;
8051 }
8052 memcpy(PyBytes_AsString(*res) + *respos,
8053 PyBytes_AsString(repunicode), repsize);
8054 *respos += repsize;
8055 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008056 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008057 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008058 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008060 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008061 Py_DECREF(repunicode);
8062 return -1;
8063 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008064 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008065 data = PyUnicode_DATA(repunicode);
8066 kind = PyUnicode_KIND(repunicode);
8067 for (index = 0; index < repsize; index++) {
8068 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8069 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008071 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 return -1;
8073 }
8074 else if (x==enc_FAILED) {
8075 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008076 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 return -1;
8078 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 }
8080 *inpos = newpos;
8081 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 }
8083 return 0;
8084}
8085
Alexander Belopolsky40018472011-02-26 01:02:56 +00008086PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008087_PyUnicode_EncodeCharmap(PyObject *unicode,
8088 PyObject *mapping,
8089 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 /* output object */
8092 PyObject *res = NULL;
8093 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008095 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 PyObject *errorHandler = NULL;
8099 PyObject *exc = NULL;
8100 /* the following variable is used for caching string comparisons
8101 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8102 * 3=ignore, 4=xmlcharrefreplace */
8103 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104
Benjamin Petersonbac79492012-01-14 13:34:47 -05008105 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008106 return NULL;
8107 size = PyUnicode_GET_LENGTH(unicode);
8108
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 /* Default to Latin-1 */
8110 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 /* allocate enough for a simple encoding without
8114 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008115 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008116 if (res == NULL)
8117 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008118 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008122 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 if (x==enc_EXCEPTION) /* error */
8126 goto onError;
8127 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 &exc,
8130 &known_errorHandler, &errorHandler, errors,
8131 &res, &respos)) {
8132 goto onError;
8133 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 else
8136 /* done with this character => adjust input position */
8137 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008141 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008142 if (_PyBytes_Resize(&res, respos) < 0)
8143 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 Py_XDECREF(exc);
8146 Py_XDECREF(errorHandler);
8147 return res;
8148
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 Py_XDECREF(res);
8151 Py_XDECREF(exc);
8152 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return NULL;
8154}
8155
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008156/* Deprecated */
8157PyObject *
8158PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8159 Py_ssize_t size,
8160 PyObject *mapping,
8161 const char *errors)
8162{
8163 PyObject *result;
8164 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8165 if (unicode == NULL)
8166 return NULL;
8167 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8168 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008169 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008170}
8171
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172PyObject *
8173PyUnicode_AsCharmapString(PyObject *unicode,
8174 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175{
8176 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 PyErr_BadArgument();
8178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008180 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181}
8182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008183/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008184static void
8185make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008186 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008187 Py_ssize_t startpos, Py_ssize_t endpos,
8188 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008190 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 *exceptionObject = _PyUnicodeTranslateError_Create(
8192 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193 }
8194 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8196 goto onError;
8197 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8198 goto onError;
8199 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8200 goto onError;
8201 return;
8202 onError:
8203 Py_DECREF(*exceptionObject);
8204 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 }
8206}
8207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008209static void
8210raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008212 Py_ssize_t startpos, Py_ssize_t endpos,
8213 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214{
8215 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219}
8220
8221/* error handling callback helper:
8222 build arguments, call the callback and check the arguments,
8223 put the result into newpos and return the replacement string, which
8224 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008225static PyObject *
8226unicode_translate_call_errorhandler(const char *errors,
8227 PyObject **errorHandler,
8228 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008230 Py_ssize_t startpos, Py_ssize_t endpos,
8231 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008233 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008235 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 PyObject *restuple;
8237 PyObject *resunicode;
8238
8239 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 }
8244
8245 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249
8250 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008255 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 Py_DECREF(restuple);
8257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 }
8259 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 &resunicode, &i_newpos)) {
8261 Py_DECREF(restuple);
8262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008264 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008266 else
8267 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8270 Py_DECREF(restuple);
8271 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 Py_INCREF(resunicode);
8274 Py_DECREF(restuple);
8275 return resunicode;
8276}
8277
8278/* Lookup the character ch in the mapping and put the result in result,
8279 which must be decrefed by the caller.
8280 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008281static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283{
Christian Heimes217cfd12007-12-02 14:31:20 +00008284 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 PyObject *x;
8286
8287 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 x = PyObject_GetItem(mapping, w);
8290 Py_DECREF(w);
8291 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8293 /* No mapping found means: use 1:1 mapping. */
8294 PyErr_Clear();
8295 *result = NULL;
8296 return 0;
8297 } else
8298 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 }
8300 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 *result = x;
8302 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008304 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 long value = PyLong_AS_LONG(x);
8306 long max = PyUnicode_GetMax();
8307 if (value < 0 || value > max) {
8308 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008309 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 Py_DECREF(x);
8311 return -1;
8312 }
8313 *result = x;
8314 return 0;
8315 }
8316 else if (PyUnicode_Check(x)) {
8317 *result = x;
8318 return 0;
8319 }
8320 else {
8321 /* wrong return value */
8322 PyErr_SetString(PyExc_TypeError,
8323 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008324 Py_DECREF(x);
8325 return -1;
8326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327}
8328/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 if not reallocate and adjust various state variables.
8330 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008336 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008337 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 /* exponentially overallocate to minimize reallocations */
8339 if (requiredsize < 2 * oldsize)
8340 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008341 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8342 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008344 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346 }
8347 return 0;
8348}
8349/* lookup the character, put the result in the output string and adjust
8350 various state variables. Return a new reference to the object that
8351 was put in the output buffer in *result, or Py_None, if the mapping was
8352 undefined (in which case no character was written).
8353 The called must decref result.
8354 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008355static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8357 PyObject *mapping, Py_UCS4 **output,
8358 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8362 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 }
8368 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008370 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 }
8374 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 Py_ssize_t repsize;
8376 if (PyUnicode_READY(*res) == -1)
8377 return -1;
8378 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (repsize==1) {
8380 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 }
8383 else if (repsize!=0) {
8384 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 Py_ssize_t requiredsize = *opos +
8386 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 Py_ssize_t i;
8389 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 for(i = 0; i < repsize; i++)
8392 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 }
8395 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 return 0;
8398}
8399
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401_PyUnicode_TranslateCharmap(PyObject *input,
8402 PyObject *mapping,
8403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 /* input object */
8406 char *idata;
8407 Py_ssize_t size, i;
8408 int kind;
8409 /* output buffer */
8410 Py_UCS4 *output = NULL;
8411 Py_ssize_t osize;
8412 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 char *reason = "character maps to <undefined>";
8416 PyObject *errorHandler = NULL;
8417 PyObject *exc = NULL;
8418 /* the following variable is used for caching string comparisons
8419 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8420 * 3=ignore, 4=xmlcharrefreplace */
8421 int known_errorHandler = -1;
8422
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 PyErr_BadArgument();
8425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 if (PyUnicode_READY(input) == -1)
8429 return NULL;
8430 idata = (char*)PyUnicode_DATA(input);
8431 kind = PyUnicode_KIND(input);
8432 size = PyUnicode_GET_LENGTH(input);
8433 i = 0;
8434
8435 if (size == 0) {
8436 Py_INCREF(input);
8437 return input;
8438 }
8439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 /* allocate enough for a simple 1:1 translation without
8441 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442 osize = size;
8443 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8444 opos = 0;
8445 if (output == NULL) {
8446 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 /* try to encode it */
8452 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 if (charmaptranslate_output(input, i, mapping,
8454 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 Py_XDECREF(x);
8456 goto onError;
8457 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 else { /* untranslatable character */
8462 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8463 Py_ssize_t repsize;
8464 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 Py_ssize_t collstart = i;
8468 Py_ssize_t collend = i+1;
8469 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 while (collend < size) {
8473 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 goto onError;
8475 Py_XDECREF(x);
8476 if (x!=Py_None)
8477 break;
8478 ++collend;
8479 }
8480 /* cache callback name lookup
8481 * (if not done yet, i.e. it's the first error) */
8482 if (known_errorHandler==-1) {
8483 if ((errors==NULL) || (!strcmp(errors, "strict")))
8484 known_errorHandler = 1;
8485 else if (!strcmp(errors, "replace"))
8486 known_errorHandler = 2;
8487 else if (!strcmp(errors, "ignore"))
8488 known_errorHandler = 3;
8489 else if (!strcmp(errors, "xmlcharrefreplace"))
8490 known_errorHandler = 4;
8491 else
8492 known_errorHandler = 0;
8493 }
8494 switch (known_errorHandler) {
8495 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 raise_translate_exception(&exc, input, collstart,
8497 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 case 2: /* replace */
8500 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 for (coll = collstart; coll<collend; coll++)
8502 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 /* fall through */
8504 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 break;
8507 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 /* generate replacement (temporarily (mis)uses i) */
8509 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 char buffer[2+29+1+1];
8511 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8513 if (charmaptranslate_makespace(&output, &osize,
8514 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 goto onError;
8516 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 break;
8521 default:
8522 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 reason, input, &exc,
8524 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008525 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008527 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008528 Py_DECREF(repunicode);
8529 goto onError;
8530 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 repsize = PyUnicode_GET_LENGTH(repunicode);
8533 if (charmaptranslate_makespace(&output, &osize,
8534 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 Py_DECREF(repunicode);
8536 goto onError;
8537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 for (uni2 = 0; repsize-->0; ++uni2)
8539 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8540 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008542 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008543 }
8544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8546 if (!res)
8547 goto onError;
8548 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 Py_XDECREF(exc);
8550 Py_XDECREF(errorHandler);
8551 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 Py_XDECREF(exc);
8556 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 return NULL;
8558}
8559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560/* Deprecated. Use PyUnicode_Translate instead. */
8561PyObject *
8562PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8563 Py_ssize_t size,
8564 PyObject *mapping,
8565 const char *errors)
8566{
8567 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8568 if (!unicode)
8569 return NULL;
8570 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8571}
8572
Alexander Belopolsky40018472011-02-26 01:02:56 +00008573PyObject *
8574PyUnicode_Translate(PyObject *str,
8575 PyObject *mapping,
8576 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577{
8578 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008579
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 str = PyUnicode_FromObject(str);
8581 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 Py_DECREF(str);
8585 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008586
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 Py_XDECREF(str);
8589 return NULL;
8590}
Tim Petersced69f82003-09-16 20:30:58 +00008591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008593fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594{
8595 /* No need to call PyUnicode_READY(self) because this function is only
8596 called as a callback from fixup() which does it already. */
8597 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8598 const int kind = PyUnicode_KIND(self);
8599 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008600 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008601 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 Py_ssize_t i;
8603
8604 for (i = 0; i < len; ++i) {
8605 ch = PyUnicode_READ(kind, data, i);
8606 fixed = 0;
8607 if (ch > 127) {
8608 if (Py_UNICODE_ISSPACE(ch))
8609 fixed = ' ';
8610 else {
8611 const int decimal = Py_UNICODE_TODECIMAL(ch);
8612 if (decimal >= 0)
8613 fixed = '0' + decimal;
8614 }
8615 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008616 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008617 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 PyUnicode_WRITE(kind, data, i, fixed);
8619 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008620 else
8621 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 }
8624
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008625 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626}
8627
8628PyObject *
8629_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8630{
8631 if (!PyUnicode_Check(unicode)) {
8632 PyErr_BadInternalCall();
8633 return NULL;
8634 }
8635 if (PyUnicode_READY(unicode) == -1)
8636 return NULL;
8637 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8638 /* If the string is already ASCII, just return the same string */
8639 Py_INCREF(unicode);
8640 return unicode;
8641 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008642 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643}
8644
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008645PyObject *
8646PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8647 Py_ssize_t length)
8648{
Victor Stinnerf0124502011-11-21 23:12:56 +01008649 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008650 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008651 Py_UCS4 maxchar;
8652 enum PyUnicode_Kind kind;
8653 void *data;
8654
Victor Stinner99d7ad02012-02-22 13:37:39 +01008655 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008656 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008657 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008658 if (ch > 127) {
8659 int decimal = Py_UNICODE_TODECIMAL(ch);
8660 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008661 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008662 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008663 }
8664 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008665
8666 /* Copy to a new string */
8667 decimal = PyUnicode_New(length, maxchar);
8668 if (decimal == NULL)
8669 return decimal;
8670 kind = PyUnicode_KIND(decimal);
8671 data = PyUnicode_DATA(decimal);
8672 /* Iterate over code points */
8673 for (i = 0; i < length; i++) {
8674 Py_UNICODE ch = s[i];
8675 if (ch > 127) {
8676 int decimal = Py_UNICODE_TODECIMAL(ch);
8677 if (decimal >= 0)
8678 ch = '0' + decimal;
8679 }
8680 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008682 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008683}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008684/* --- Decimal Encoder ---------------------------------------------------- */
8685
Alexander Belopolsky40018472011-02-26 01:02:56 +00008686int
8687PyUnicode_EncodeDecimal(Py_UNICODE *s,
8688 Py_ssize_t length,
8689 char *output,
8690 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008691{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008692 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008693 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008694 enum PyUnicode_Kind kind;
8695 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696
8697 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 PyErr_BadArgument();
8699 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008700 }
8701
Victor Stinner42bf7752011-11-21 22:52:58 +01008702 unicode = PyUnicode_FromUnicode(s, length);
8703 if (unicode == NULL)
8704 return -1;
8705
Benjamin Petersonbac79492012-01-14 13:34:47 -05008706 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008707 Py_DECREF(unicode);
8708 return -1;
8709 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008710 kind = PyUnicode_KIND(unicode);
8711 data = PyUnicode_DATA(unicode);
8712
Victor Stinnerb84d7232011-11-22 01:50:07 +01008713 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008714 PyObject *exc;
8715 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008717 Py_ssize_t startpos;
8718
8719 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008720
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008722 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008723 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 decimal = Py_UNICODE_TODECIMAL(ch);
8727 if (decimal >= 0) {
8728 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008729 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 continue;
8731 }
8732 if (0 < ch && ch < 256) {
8733 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008734 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 continue;
8736 }
Victor Stinner6345be92011-11-25 20:09:01 +01008737
Victor Stinner42bf7752011-11-21 22:52:58 +01008738 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008739 exc = NULL;
8740 raise_encode_exception(&exc, "decimal", unicode,
8741 startpos, startpos+1,
8742 "invalid decimal Unicode string");
8743 Py_XDECREF(exc);
8744 Py_DECREF(unicode);
8745 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008746 }
8747 /* 0-terminate the output string */
8748 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008749 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008750 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008751}
8752
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753/* --- Helpers ------------------------------------------------------------ */
8754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008756any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 Py_ssize_t start,
8758 Py_ssize_t end)
8759{
8760 int kind1, kind2, kind;
8761 void *buf1, *buf2;
8762 Py_ssize_t len1, len2, result;
8763
8764 kind1 = PyUnicode_KIND(s1);
8765 kind2 = PyUnicode_KIND(s2);
8766 kind = kind1 > kind2 ? kind1 : kind2;
8767 buf1 = PyUnicode_DATA(s1);
8768 buf2 = PyUnicode_DATA(s2);
8769 if (kind1 != kind)
8770 buf1 = _PyUnicode_AsKind(s1, kind);
8771 if (!buf1)
8772 return -2;
8773 if (kind2 != kind)
8774 buf2 = _PyUnicode_AsKind(s2, kind);
8775 if (!buf2) {
8776 if (kind1 != kind) PyMem_Free(buf1);
8777 return -2;
8778 }
8779 len1 = PyUnicode_GET_LENGTH(s1);
8780 len2 = PyUnicode_GET_LENGTH(s2);
8781
Victor Stinner794d5672011-10-10 03:21:36 +02008782 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008783 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008784 case PyUnicode_1BYTE_KIND:
8785 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8786 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8787 else
8788 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8789 break;
8790 case PyUnicode_2BYTE_KIND:
8791 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8792 break;
8793 case PyUnicode_4BYTE_KIND:
8794 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8795 break;
8796 default:
8797 assert(0); result = -2;
8798 }
8799 }
8800 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008801 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008802 case PyUnicode_1BYTE_KIND:
8803 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8804 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8805 else
8806 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8807 break;
8808 case PyUnicode_2BYTE_KIND:
8809 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8810 break;
8811 case PyUnicode_4BYTE_KIND:
8812 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8813 break;
8814 default:
8815 assert(0); result = -2;
8816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 }
8818
8819 if (kind1 != kind)
8820 PyMem_Free(buf1);
8821 if (kind2 != kind)
8822 PyMem_Free(buf2);
8823
8824 return result;
8825}
8826
8827Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008828_PyUnicode_InsertThousandsGrouping(
8829 PyObject *unicode, Py_ssize_t index,
8830 Py_ssize_t n_buffer,
8831 void *digits, Py_ssize_t n_digits,
8832 Py_ssize_t min_width,
8833 const char *grouping, PyObject *thousands_sep,
8834 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835{
Victor Stinner41a863c2012-02-24 00:37:51 +01008836 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008837 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008838 Py_ssize_t thousands_sep_len;
8839 Py_ssize_t len;
8840
8841 if (unicode != NULL) {
8842 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008843 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008844 }
8845 else {
8846 kind = PyUnicode_1BYTE_KIND;
8847 data = NULL;
8848 }
8849 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8850 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8851 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8852 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008853 if (thousands_sep_kind < kind) {
8854 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8855 if (!thousands_sep_data)
8856 return -1;
8857 }
8858 else {
8859 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8860 if (!data)
8861 return -1;
8862 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008863 }
8864
Benjamin Petersonead6b532011-12-20 17:23:42 -06008865 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008867 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008868 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008869 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008870 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008871 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008872 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008874 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008879 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008880 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008881 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008882 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008883 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008885 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008886 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008887 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008888 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008889 break;
8890 default:
8891 assert(0);
8892 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008894 if (unicode != NULL && thousands_sep_kind != kind) {
8895 if (thousands_sep_kind < kind)
8896 PyMem_Free(thousands_sep_data);
8897 else
8898 PyMem_Free(data);
8899 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008900 if (unicode == NULL) {
8901 *maxchar = 127;
8902 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008903 *maxchar = MAX_MAXCHAR(*maxchar,
8904 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008905 }
8906 }
8907 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908}
8909
8910
/* Helper macro to fix up start/end slice values in place:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and is then
       floored at 0.

   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so the macro expands to a
   single statement (safe as the body of an unbraced if/else); invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008925
Alexander Belopolsky40018472011-02-26 01:02:56 +00008926Py_ssize_t
8927PyUnicode_Count(PyObject *str,
8928 PyObject *substr,
8929 Py_ssize_t start,
8930 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008932 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008933 PyObject* str_obj;
8934 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 int kind1, kind2, kind;
8936 void *buf1 = NULL, *buf2 = NULL;
8937 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008938
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008939 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008940 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008942 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008943 if (!sub_obj) {
8944 Py_DECREF(str_obj);
8945 return -1;
8946 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008947 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008948 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 Py_DECREF(str_obj);
8950 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 }
Tim Petersced69f82003-09-16 20:30:58 +00008952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 kind1 = PyUnicode_KIND(str_obj);
8954 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008955 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008958 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008959 if (kind2 > kind) {
8960 Py_DECREF(sub_obj);
8961 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008962 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008963 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008964 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 if (!buf2)
8967 goto onError;
8968 len1 = PyUnicode_GET_LENGTH(str_obj);
8969 len2 = PyUnicode_GET_LENGTH(sub_obj);
8970
8971 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008972 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008974 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8975 result = asciilib_count(
8976 ((Py_UCS1*)buf1) + start, end - start,
8977 buf2, len2, PY_SSIZE_T_MAX
8978 );
8979 else
8980 result = ucs1lib_count(
8981 ((Py_UCS1*)buf1) + start, end - start,
8982 buf2, len2, PY_SSIZE_T_MAX
8983 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 break;
8985 case PyUnicode_2BYTE_KIND:
8986 result = ucs2lib_count(
8987 ((Py_UCS2*)buf1) + start, end - start,
8988 buf2, len2, PY_SSIZE_T_MAX
8989 );
8990 break;
8991 case PyUnicode_4BYTE_KIND:
8992 result = ucs4lib_count(
8993 ((Py_UCS4*)buf1) + start, end - start,
8994 buf2, len2, PY_SSIZE_T_MAX
8995 );
8996 break;
8997 default:
8998 assert(0); result = 0;
8999 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009000
9001 Py_DECREF(sub_obj);
9002 Py_DECREF(str_obj);
9003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 if (kind2 != kind)
9005 PyMem_Free(buf2);
9006
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 onError:
9009 Py_DECREF(sub_obj);
9010 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (kind2 != kind && buf2)
9012 PyMem_Free(buf2);
9013 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014}
9015
Alexander Belopolsky40018472011-02-26 01:02:56 +00009016Py_ssize_t
9017PyUnicode_Find(PyObject *str,
9018 PyObject *sub,
9019 Py_ssize_t start,
9020 Py_ssize_t end,
9021 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009023 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009024
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009026 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009028 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009029 if (!sub) {
9030 Py_DECREF(str);
9031 return -2;
9032 }
9033 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9034 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 Py_DECREF(str);
9036 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 }
Tim Petersced69f82003-09-16 20:30:58 +00009038
Victor Stinner794d5672011-10-10 03:21:36 +02009039 result = any_find_slice(direction,
9040 str, sub, start, end
9041 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009044 Py_DECREF(sub);
9045
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 return result;
9047}
9048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049Py_ssize_t
9050PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9051 Py_ssize_t start, Py_ssize_t end,
9052 int direction)
9053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009055 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 if (PyUnicode_READY(str) == -1)
9057 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009058 if (start < 0 || end < 0) {
9059 PyErr_SetString(PyExc_IndexError, "string index out of range");
9060 return -2;
9061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 if (end > PyUnicode_GET_LENGTH(str))
9063 end = PyUnicode_GET_LENGTH(str);
9064 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009065 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9066 kind, end-start, ch, direction);
9067 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009069 else
9070 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071}
9072
Alexander Belopolsky40018472011-02-26 01:02:56 +00009073static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009074tailmatch(PyObject *self,
9075 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076 Py_ssize_t start,
9077 Py_ssize_t end,
9078 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 int kind_self;
9081 int kind_sub;
9082 void *data_self;
9083 void *data_sub;
9084 Py_ssize_t offset;
9085 Py_ssize_t i;
9086 Py_ssize_t end_sub;
9087
9088 if (PyUnicode_READY(self) == -1 ||
9089 PyUnicode_READY(substring) == -1)
9090 return 0;
9091
9092 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 return 1;
9094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9096 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 kind_self = PyUnicode_KIND(self);
9101 data_self = PyUnicode_DATA(self);
9102 kind_sub = PyUnicode_KIND(substring);
9103 data_sub = PyUnicode_DATA(substring);
9104 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9105
9106 if (direction > 0)
9107 offset = end;
9108 else
9109 offset = start;
9110
9111 if (PyUnicode_READ(kind_self, data_self, offset) ==
9112 PyUnicode_READ(kind_sub, data_sub, 0) &&
9113 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9114 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9115 /* If both are of the same kind, memcmp is sufficient */
9116 if (kind_self == kind_sub) {
9117 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009118 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 data_sub,
9120 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009121 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 }
9123 /* otherwise we have to compare each character by first accesing it */
9124 else {
9125 /* We do not need to compare 0 and len(substring)-1 because
9126 the if statement above ensured already that they are equal
9127 when we end up here. */
9128 // TODO: honor direction and do a forward or backwards search
9129 for (i = 1; i < end_sub; ++i) {
9130 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9131 PyUnicode_READ(kind_sub, data_sub, i))
9132 return 0;
9133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 }
9137
9138 return 0;
9139}
9140
Alexander Belopolsky40018472011-02-26 01:02:56 +00009141Py_ssize_t
9142PyUnicode_Tailmatch(PyObject *str,
9143 PyObject *substr,
9144 Py_ssize_t start,
9145 Py_ssize_t end,
9146 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009148 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009149
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 str = PyUnicode_FromObject(str);
9151 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 substr = PyUnicode_FromObject(substr);
9154 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 Py_DECREF(str);
9156 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 }
Tim Petersced69f82003-09-16 20:30:58 +00009158
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009159 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 Py_DECREF(str);
9162 Py_DECREF(substr);
9163 return result;
9164}
9165
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166/* Apply fixfct filter to the Unicode object self and return a
9167 reference to the modified object */
9168
Alexander Belopolsky40018472011-02-26 01:02:56 +00009169static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009170fixup(PyObject *self,
9171 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 PyObject *u;
9174 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009175 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009177 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009180 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 /* fix functions return the new maximum character in a string,
9183 if the kind of the resulting unicode object does not change,
9184 everything is fine. Otherwise we need to change the string kind
9185 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009186 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009187
9188 if (maxchar_new == 0) {
9189 /* no changes */;
9190 if (PyUnicode_CheckExact(self)) {
9191 Py_DECREF(u);
9192 Py_INCREF(self);
9193 return self;
9194 }
9195 else
9196 return u;
9197 }
9198
Victor Stinnere6abb482012-05-02 01:15:40 +02009199 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200
Victor Stinnereaab6042011-12-11 22:22:39 +01009201 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009203
9204 /* In case the maximum character changed, we need to
9205 convert the string to the new category. */
9206 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9207 if (v == NULL) {
9208 Py_DECREF(u);
9209 return NULL;
9210 }
9211 if (maxchar_new > maxchar_old) {
9212 /* If the maxchar increased so that the kind changed, not all
9213 characters are representable anymore and we need to fix the
9214 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009215 _PyUnicode_FastCopyCharacters(v, 0,
9216 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009217 maxchar_old = fixfct(v);
9218 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 }
9220 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009221 _PyUnicode_FastCopyCharacters(v, 0,
9222 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009224 Py_DECREF(u);
9225 assert(_PyUnicode_CheckConsistency(v, 1));
9226 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227}
9228
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009229static PyObject *
9230ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009232 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9233 char *resdata, *data = PyUnicode_DATA(self);
9234 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009235
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009236 res = PyUnicode_New(len, 127);
9237 if (res == NULL)
9238 return NULL;
9239 resdata = PyUnicode_DATA(res);
9240 if (lower)
9241 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009243 _Py_bytes_upper(resdata, data, len);
9244 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245}
9246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009248handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009250 Py_ssize_t j;
9251 int final_sigma;
9252 Py_UCS4 c;
9253 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009254
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9256
9257 where ! is a negation and \p{xxx} is a character with property xxx.
9258 */
9259 for (j = i - 1; j >= 0; j--) {
9260 c = PyUnicode_READ(kind, data, j);
9261 if (!_PyUnicode_IsCaseIgnorable(c))
9262 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009264 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9265 if (final_sigma) {
9266 for (j = i + 1; j < length; j++) {
9267 c = PyUnicode_READ(kind, data, j);
9268 if (!_PyUnicode_IsCaseIgnorable(c))
9269 break;
9270 }
9271 final_sigma = j == length || !_PyUnicode_IsCased(c);
9272 }
9273 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274}
9275
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009276static int
9277lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9278 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009280 /* Obscure special case. */
9281 if (c == 0x3A3) {
9282 mapped[0] = handle_capital_sigma(kind, data, length, i);
9283 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009285 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286}
9287
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009288static Py_ssize_t
9289do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009291 Py_ssize_t i, k = 0;
9292 int n_res, j;
9293 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009294
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009295 c = PyUnicode_READ(kind, data, 0);
9296 n_res = _PyUnicode_ToUpperFull(c, mapped);
9297 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009298 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009299 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301 for (i = 1; i < length; i++) {
9302 c = PyUnicode_READ(kind, data, i);
9303 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9304 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009305 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009307 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009308 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009309 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310}
9311
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009312static Py_ssize_t
9313do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9314 Py_ssize_t i, k = 0;
9315
9316 for (i = 0; i < length; i++) {
9317 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9318 int n_res, j;
9319 if (Py_UNICODE_ISUPPER(c)) {
9320 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9321 }
9322 else if (Py_UNICODE_ISLOWER(c)) {
9323 n_res = _PyUnicode_ToUpperFull(c, mapped);
9324 }
9325 else {
9326 n_res = 1;
9327 mapped[0] = c;
9328 }
9329 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009330 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009331 res[k++] = mapped[j];
9332 }
9333 }
9334 return k;
9335}
9336
9337static Py_ssize_t
9338do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9339 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009341 Py_ssize_t i, k = 0;
9342
9343 for (i = 0; i < length; i++) {
9344 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9345 int n_res, j;
9346 if (lower)
9347 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9348 else
9349 n_res = _PyUnicode_ToUpperFull(c, mapped);
9350 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009351 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009352 res[k++] = mapped[j];
9353 }
9354 }
9355 return k;
9356}
9357
9358static Py_ssize_t
9359do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9360{
9361 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9362}
9363
9364static Py_ssize_t
9365do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9366{
9367 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9368}
9369
Benjamin Petersone51757f2012-01-12 21:10:29 -05009370static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009371do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9372{
9373 Py_ssize_t i, k = 0;
9374
9375 for (i = 0; i < length; i++) {
9376 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9377 Py_UCS4 mapped[3];
9378 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9379 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009380 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009381 res[k++] = mapped[j];
9382 }
9383 }
9384 return k;
9385}
9386
9387static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009388do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9389{
9390 Py_ssize_t i, k = 0;
9391 int previous_is_cased;
9392
9393 previous_is_cased = 0;
9394 for (i = 0; i < length; i++) {
9395 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9396 Py_UCS4 mapped[3];
9397 int n_res, j;
9398
9399 if (previous_is_cased)
9400 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9401 else
9402 n_res = _PyUnicode_ToTitleFull(c, mapped);
9403
9404 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009405 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009406 res[k++] = mapped[j];
9407 }
9408
9409 previous_is_cased = _PyUnicode_IsCased(c);
9410 }
9411 return k;
9412}
9413
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009414static PyObject *
9415case_operation(PyObject *self,
9416 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9417{
9418 PyObject *res = NULL;
9419 Py_ssize_t length, newlength = 0;
9420 int kind, outkind;
9421 void *data, *outdata;
9422 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9423
Benjamin Petersoneea48462012-01-16 14:28:50 -05009424 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009425
9426 kind = PyUnicode_KIND(self);
9427 data = PyUnicode_DATA(self);
9428 length = PyUnicode_GET_LENGTH(self);
9429 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9430 if (tmp == NULL)
9431 return PyErr_NoMemory();
9432 newlength = perform(kind, data, length, tmp, &maxchar);
9433 res = PyUnicode_New(newlength, maxchar);
9434 if (res == NULL)
9435 goto leave;
9436 tmpend = tmp + newlength;
9437 outdata = PyUnicode_DATA(res);
9438 outkind = PyUnicode_KIND(res);
9439 switch (outkind) {
9440 case PyUnicode_1BYTE_KIND:
9441 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9442 break;
9443 case PyUnicode_2BYTE_KIND:
9444 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9445 break;
9446 case PyUnicode_4BYTE_KIND:
9447 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9448 break;
9449 default:
9450 assert(0);
9451 break;
9452 }
9453 leave:
9454 PyMem_FREE(tmp);
9455 return res;
9456}
9457
Tim Peters8ce9f162004-08-27 01:49:32 +00009458PyObject *
9459PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009462 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009464 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009465 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9466 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009467 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009469 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009471 int use_memcpy;
9472 unsigned char *res_data = NULL, *sep_data = NULL;
9473 PyObject *last_obj;
9474 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475
Tim Peters05eba1f2004-08-27 21:32:02 +00009476 fseq = PySequence_Fast(seq, "");
9477 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009478 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009479 }
9480
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009481 /* NOTE: the following code can't call back into Python code,
9482 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009483 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009484
Tim Peters05eba1f2004-08-27 21:32:02 +00009485 seqlen = PySequence_Fast_GET_SIZE(fseq);
9486 /* If empty sequence, return u"". */
9487 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009488 Py_DECREF(fseq);
9489 Py_INCREF(unicode_empty);
9490 res = unicode_empty;
9491 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009492 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009493
Tim Peters05eba1f2004-08-27 21:32:02 +00009494 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009495 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009496 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009497 if (seqlen == 1) {
9498 if (PyUnicode_CheckExact(items[0])) {
9499 res = items[0];
9500 Py_INCREF(res);
9501 Py_DECREF(fseq);
9502 return res;
9503 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009504 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009505 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009506 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009507 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009508 /* Set up sep and seplen */
9509 if (separator == NULL) {
9510 /* fall back to a blank space separator */
9511 sep = PyUnicode_FromOrdinal(' ');
9512 if (!sep)
9513 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009514 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009515 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009516 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009517 else {
9518 if (!PyUnicode_Check(separator)) {
9519 PyErr_Format(PyExc_TypeError,
9520 "separator: expected str instance,"
9521 " %.80s found",
9522 Py_TYPE(separator)->tp_name);
9523 goto onError;
9524 }
9525 if (PyUnicode_READY(separator))
9526 goto onError;
9527 sep = separator;
9528 seplen = PyUnicode_GET_LENGTH(separator);
9529 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9530 /* inc refcount to keep this code path symmetric with the
9531 above case of a blank separator */
9532 Py_INCREF(sep);
9533 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009534 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009535 }
9536
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009537 /* There are at least two things to join, or else we have a subclass
9538 * of str in the sequence.
9539 * Do a pre-pass to figure out the total amount of space we'll
9540 * need (sz), and see whether all argument are strings.
9541 */
9542 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009543#ifdef Py_DEBUG
9544 use_memcpy = 0;
9545#else
9546 use_memcpy = 1;
9547#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009548 for (i = 0; i < seqlen; i++) {
9549 const Py_ssize_t old_sz = sz;
9550 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 if (!PyUnicode_Check(item)) {
9552 PyErr_Format(PyExc_TypeError,
9553 "sequence item %zd: expected str instance,"
9554 " %.80s found",
9555 i, Py_TYPE(item)->tp_name);
9556 goto onError;
9557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 if (PyUnicode_READY(item) == -1)
9559 goto onError;
9560 sz += PyUnicode_GET_LENGTH(item);
9561 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009562 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009563 if (i != 0)
9564 sz += seplen;
9565 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9566 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009568 goto onError;
9569 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009570 if (use_memcpy && last_obj != NULL) {
9571 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9572 use_memcpy = 0;
9573 }
9574 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009575 }
Tim Petersced69f82003-09-16 20:30:58 +00009576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009578 if (res == NULL)
9579 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009580
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009582#ifdef Py_DEBUG
9583 use_memcpy = 0;
9584#else
9585 if (use_memcpy) {
9586 res_data = PyUnicode_1BYTE_DATA(res);
9587 kind = PyUnicode_KIND(res);
9588 if (seplen != 0)
9589 sep_data = PyUnicode_1BYTE_DATA(sep);
9590 }
9591#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009593 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009594 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009596 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009597 if (use_memcpy) {
9598 Py_MEMCPY(res_data,
9599 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009600 kind * seplen);
9601 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009602 }
9603 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009604 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009605 res_offset += seplen;
9606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009607 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009608 itemlen = PyUnicode_GET_LENGTH(item);
9609 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009610 if (use_memcpy) {
9611 Py_MEMCPY(res_data,
9612 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009613 kind * itemlen);
9614 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009615 }
9616 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009617 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 res_offset += itemlen;
9619 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009620 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009621 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009622 if (use_memcpy)
9623 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009624 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 else
9626 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009627
Tim Peters05eba1f2004-08-27 21:32:02 +00009628 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009630 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632
Benjamin Peterson29060642009-01-31 22:14:21 +00009633 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009634 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009636 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 return NULL;
9638}
9639
/* Store `value` into `length` consecutive code units of the buffer `data`,
   starting at code-unit index `start`.  `kind` selects the code-unit width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND); any
   other kind trips an assertion.  `value` is evaluated exactly once and is
   converted to the code-unit type, so the caller must make sure it fits.
   Every argument is parenthesized, so arbitrary expressions may be passed. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            const Py_UCS2 ch_ = (Py_UCS2)(value); \
            for (; i_ < (length); ++i_, ++to_) *to_ = ch_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            const Py_UCS4 ch_ = (Py_UCS4)(value); \
            for (; i_ < (length); ++i_, ++to_) *to_ = ch_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9663
Victor Stinnerd3f08822012-05-29 12:57:52 +02009664void
9665_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9666 Py_UCS4 fill_char)
9667{
9668 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9669 const void *data = PyUnicode_DATA(unicode);
9670 assert(PyUnicode_IS_READY(unicode));
9671 assert(unicode_modifiable(unicode));
9672 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9673 assert(start >= 0);
9674 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9675 FILL(kind, data, fill_char, start, length);
9676}
9677
Victor Stinner3fe55312012-01-04 00:33:50 +01009678Py_ssize_t
9679PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9680 Py_UCS4 fill_char)
9681{
9682 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009683
9684 if (!PyUnicode_Check(unicode)) {
9685 PyErr_BadInternalCall();
9686 return -1;
9687 }
9688 if (PyUnicode_READY(unicode) == -1)
9689 return -1;
9690 if (unicode_check_modifiable(unicode))
9691 return -1;
9692
Victor Stinnerd3f08822012-05-29 12:57:52 +02009693 if (start < 0) {
9694 PyErr_SetString(PyExc_IndexError, "string index out of range");
9695 return -1;
9696 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009697 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9698 PyErr_SetString(PyExc_ValueError,
9699 "fill character is bigger than "
9700 "the string maximum character");
9701 return -1;
9702 }
9703
9704 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9705 length = Py_MIN(maxlen, length);
9706 if (length <= 0)
9707 return 0;
9708
Victor Stinnerd3f08822012-05-29 12:57:52 +02009709 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009710 return length;
9711}
9712
Victor Stinner9310abb2011-10-05 00:59:23 +02009713static PyObject *
9714pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009715 Py_ssize_t left,
9716 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 PyObject *u;
9720 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009721 int kind;
9722 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723
9724 if (left < 0)
9725 left = 0;
9726 if (right < 0)
9727 right = 0;
9728
Victor Stinnerc4b49542011-12-11 22:44:26 +01009729 if (left == 0 && right == 0)
9730 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9733 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009734 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9735 return NULL;
9736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009738 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009740 if (!u)
9741 return NULL;
9742
9743 kind = PyUnicode_KIND(u);
9744 data = PyUnicode_DATA(u);
9745 if (left)
9746 FILL(kind, data, fill, 0, left);
9747 if (right)
9748 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009749 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009750 assert(_PyUnicode_CheckConsistency(u, 1));
9751 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752}
9753
Alexander Belopolsky40018472011-02-26 01:02:56 +00009754PyObject *
9755PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758
9759 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009760 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009762 if (PyUnicode_READY(string) == -1) {
9763 Py_DECREF(string);
9764 return NULL;
9765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766
Benjamin Petersonead6b532011-12-20 17:23:42 -06009767 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009769 if (PyUnicode_IS_ASCII(string))
9770 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 PyUnicode_GET_LENGTH(string), keepends);
9773 else
9774 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009775 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009776 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 break;
9778 case PyUnicode_2BYTE_KIND:
9779 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009780 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 PyUnicode_GET_LENGTH(string), keepends);
9782 break;
9783 case PyUnicode_4BYTE_KIND:
9784 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009785 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 PyUnicode_GET_LENGTH(string), keepends);
9787 break;
9788 default:
9789 assert(0);
9790 list = 0;
9791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792 Py_DECREF(string);
9793 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794}
9795
Alexander Belopolsky40018472011-02-26 01:02:56 +00009796static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009797split(PyObject *self,
9798 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 int kind1, kind2, kind;
9802 void *buf1, *buf2;
9803 Py_ssize_t len1, len2;
9804 PyObject* out;
9805
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009807 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 if (PyUnicode_READY(self) == -1)
9810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009813 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009815 if (PyUnicode_IS_ASCII(self))
9816 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009817 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 PyUnicode_GET_LENGTH(self), maxcount
9819 );
9820 else
9821 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009822 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009823 PyUnicode_GET_LENGTH(self), maxcount
9824 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 case PyUnicode_2BYTE_KIND:
9826 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009827 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 PyUnicode_GET_LENGTH(self), maxcount
9829 );
9830 case PyUnicode_4BYTE_KIND:
9831 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009832 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 PyUnicode_GET_LENGTH(self), maxcount
9834 );
9835 default:
9836 assert(0);
9837 return NULL;
9838 }
9839
9840 if (PyUnicode_READY(substring) == -1)
9841 return NULL;
9842
9843 kind1 = PyUnicode_KIND(self);
9844 kind2 = PyUnicode_KIND(substring);
9845 kind = kind1 > kind2 ? kind1 : kind2;
9846 buf1 = PyUnicode_DATA(self);
9847 buf2 = PyUnicode_DATA(substring);
9848 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009849 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (!buf1)
9851 return NULL;
9852 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009853 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 if (!buf2) {
9855 if (kind1 != kind) PyMem_Free(buf1);
9856 return NULL;
9857 }
9858 len1 = PyUnicode_GET_LENGTH(self);
9859 len2 = PyUnicode_GET_LENGTH(substring);
9860
Benjamin Petersonead6b532011-12-20 17:23:42 -06009861 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009863 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9864 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009865 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 else
9867 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 break;
9870 case PyUnicode_2BYTE_KIND:
9871 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 break;
9874 case PyUnicode_4BYTE_KIND:
9875 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009876 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 break;
9878 default:
9879 out = NULL;
9880 }
9881 if (kind1 != kind)
9882 PyMem_Free(buf1);
9883 if (kind2 != kind)
9884 PyMem_Free(buf2);
9885 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886}
9887
Alexander Belopolsky40018472011-02-26 01:02:56 +00009888static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009889rsplit(PyObject *self,
9890 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 int kind1, kind2, kind;
9894 void *buf1, *buf2;
9895 Py_ssize_t len1, len2;
9896 PyObject* out;
9897
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009898 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009899 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (PyUnicode_READY(self) == -1)
9902 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009905 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009907 if (PyUnicode_IS_ASCII(self))
9908 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009909 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 PyUnicode_GET_LENGTH(self), maxcount
9911 );
9912 else
9913 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009914 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009915 PyUnicode_GET_LENGTH(self), maxcount
9916 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 case PyUnicode_2BYTE_KIND:
9918 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009919 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 PyUnicode_GET_LENGTH(self), maxcount
9921 );
9922 case PyUnicode_4BYTE_KIND:
9923 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009924 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 PyUnicode_GET_LENGTH(self), maxcount
9926 );
9927 default:
9928 assert(0);
9929 return NULL;
9930 }
9931
9932 if (PyUnicode_READY(substring) == -1)
9933 return NULL;
9934
9935 kind1 = PyUnicode_KIND(self);
9936 kind2 = PyUnicode_KIND(substring);
9937 kind = kind1 > kind2 ? kind1 : kind2;
9938 buf1 = PyUnicode_DATA(self);
9939 buf2 = PyUnicode_DATA(substring);
9940 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009941 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (!buf1)
9943 return NULL;
9944 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009945 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 if (!buf2) {
9947 if (kind1 != kind) PyMem_Free(buf1);
9948 return NULL;
9949 }
9950 len1 = PyUnicode_GET_LENGTH(self);
9951 len2 = PyUnicode_GET_LENGTH(substring);
9952
Benjamin Petersonead6b532011-12-20 17:23:42 -06009953 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009955 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9956 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009957 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009958 else
9959 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 break;
9962 case PyUnicode_2BYTE_KIND:
9963 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009964 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 break;
9966 case PyUnicode_4BYTE_KIND:
9967 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009968 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 break;
9970 default:
9971 out = NULL;
9972 }
9973 if (kind1 != kind)
9974 PyMem_Free(buf1);
9975 if (kind2 != kind)
9976 PyMem_Free(buf2);
9977 return out;
9978}
9979
9980static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009981anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9982 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009984 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009986 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9987 return asciilib_find(buf1, len1, buf2, len2, offset);
9988 else
9989 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 case PyUnicode_2BYTE_KIND:
9991 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9992 case PyUnicode_4BYTE_KIND:
9993 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9994 }
9995 assert(0);
9996 return -1;
9997}
9998
9999static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10001 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010003 switch (kind) {
10004 case PyUnicode_1BYTE_KIND:
10005 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10006 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10007 else
10008 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10009 case PyUnicode_2BYTE_KIND:
10010 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10011 case PyUnicode_4BYTE_KIND:
10012 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10013 }
10014 assert(0);
10015 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010016}
10017
Alexander Belopolsky40018472011-02-26 01:02:56 +000010018static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019replace(PyObject *self, PyObject *str1,
10020 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 PyObject *u;
10023 char *sbuf = PyUnicode_DATA(self);
10024 char *buf1 = PyUnicode_DATA(str1);
10025 char *buf2 = PyUnicode_DATA(str2);
10026 int srelease = 0, release1 = 0, release2 = 0;
10027 int skind = PyUnicode_KIND(self);
10028 int kind1 = PyUnicode_KIND(str1);
10029 int kind2 = PyUnicode_KIND(str2);
10030 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10031 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10032 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010033 int mayshrink;
10034 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
10036 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010037 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010039 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040
Victor Stinner59de0ee2011-10-07 10:01:28 +020010041 if (str1 == str2)
10042 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 if (skind < kind1)
10044 /* substring too wide to be present */
10045 goto nothing;
10046
Victor Stinner49a0a212011-10-12 23:46:10 +020010047 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10048 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10049 /* Replacing str1 with str2 may cause a maxchar reduction in the
10050 result string. */
10051 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010052 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010055 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010057 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010060 Py_UCS4 u1, u2;
10061 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010062 Py_ssize_t index, pos;
10063 char *src;
10064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010066 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10067 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010068 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010073 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010075
10076 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10077 index = 0;
10078 src = sbuf;
10079 while (--maxcount)
10080 {
10081 pos++;
10082 src += pos * PyUnicode_KIND(self);
10083 slen -= pos;
10084 index += pos;
10085 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10086 if (pos < 0)
10087 break;
10088 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10089 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010090 }
10091 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 int rkind = skind;
10093 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010094 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (kind1 < rkind) {
10097 /* widen substring */
10098 buf1 = _PyUnicode_AsKind(str1, rkind);
10099 if (!buf1) goto error;
10100 release1 = 1;
10101 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010102 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010103 if (i < 0)
10104 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 if (rkind > kind2) {
10106 /* widen replacement */
10107 buf2 = _PyUnicode_AsKind(str2, rkind);
10108 if (!buf2) goto error;
10109 release2 = 1;
10110 }
10111 else if (rkind < kind2) {
10112 /* widen self and buf1 */
10113 rkind = kind2;
10114 if (release1) PyMem_Free(buf1);
10115 sbuf = _PyUnicode_AsKind(self, rkind);
10116 if (!sbuf) goto error;
10117 srelease = 1;
10118 buf1 = _PyUnicode_AsKind(str1, rkind);
10119 if (!buf1) goto error;
10120 release1 = 1;
10121 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010122 u = PyUnicode_New(slen, maxchar);
10123 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010125 assert(PyUnicode_KIND(u) == rkind);
10126 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010127
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010128 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010129 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010130 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010134
10135 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010136 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010137 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010138 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010139 if (i == -1)
10140 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010143 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010147 }
10148 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 Py_ssize_t n, i, j, ires;
10150 Py_ssize_t product, new_size;
10151 int rkind = skind;
10152 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010155 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 buf1 = _PyUnicode_AsKind(str1, rkind);
10157 if (!buf1) goto error;
10158 release1 = 1;
10159 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010160 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161 if (n == 0)
10162 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010164 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 buf2 = _PyUnicode_AsKind(str2, rkind);
10166 if (!buf2) goto error;
10167 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010170 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 rkind = kind2;
10172 sbuf = _PyUnicode_AsKind(self, rkind);
10173 if (!sbuf) goto error;
10174 srelease = 1;
10175 if (release1) PyMem_Free(buf1);
10176 buf1 = _PyUnicode_AsKind(str1, rkind);
10177 if (!buf1) goto error;
10178 release1 = 1;
10179 }
10180 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10181 PyUnicode_GET_LENGTH(str1))); */
10182 product = n * (len2-len1);
10183 if ((product / (len2-len1)) != n) {
10184 PyErr_SetString(PyExc_OverflowError,
10185 "replace string is too long");
10186 goto error;
10187 }
10188 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010189 if (new_size == 0) {
10190 Py_INCREF(unicode_empty);
10191 u = unicode_empty;
10192 goto done;
10193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10195 PyErr_SetString(PyExc_OverflowError,
10196 "replace string is too long");
10197 goto error;
10198 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010199 u = PyUnicode_New(new_size, maxchar);
10200 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010202 assert(PyUnicode_KIND(u) == rkind);
10203 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 ires = i = 0;
10205 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010206 while (n-- > 0) {
10207 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010209 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010211 if (j == -1)
10212 break;
10213 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010214 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 memcpy(res + rkind * ires,
10216 sbuf + rkind * i,
10217 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010219 }
10220 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010222 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010231 memcpy(res + rkind * ires,
10232 sbuf + rkind * i,
10233 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010234 }
10235 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 /* interleave */
10237 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010240 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 if (--n <= 0)
10243 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res + rkind * ires,
10245 sbuf + rkind * i,
10246 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 ires++;
10248 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 memcpy(res + rkind * ires,
10251 sbuf + rkind * i,
10252 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010253 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010254 }
10255
10256 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010257 unicode_adjust_maxchar(&u);
10258 if (u == NULL)
10259 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010261
10262 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 if (srelease)
10264 PyMem_FREE(sbuf);
10265 if (release1)
10266 PyMem_FREE(buf1);
10267 if (release2)
10268 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010269 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010271
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010273 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 if (srelease)
10275 PyMem_FREE(sbuf);
10276 if (release1)
10277 PyMem_FREE(buf1);
10278 if (release2)
10279 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010280 return unicode_result_unchanged(self);
10281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 error:
10283 if (srelease && sbuf)
10284 PyMem_FREE(sbuf);
10285 if (release1 && buf1)
10286 PyMem_FREE(buf1);
10287 if (release2 && buf2)
10288 PyMem_FREE(buf2);
10289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290}
10291
10292/* --- Unicode Object Methods --------------------------------------------- */
10293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010294PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296\n\
10297Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010298characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299
10300static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010301unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010303 if (PyUnicode_READY(self) == -1)
10304 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010305 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310\n\
10311Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010312have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010315unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010317 if (PyUnicode_READY(self) == -1)
10318 return NULL;
10319 if (PyUnicode_GET_LENGTH(self) == 0)
10320 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010321 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322}
10323
Benjamin Petersond5890c82012-01-14 13:23:30 -050010324PyDoc_STRVAR(casefold__doc__,
10325 "S.casefold() -> str\n\
10326\n\
10327Return a version of S suitable for caseless comparisons.");
10328
10329static PyObject *
10330unicode_casefold(PyObject *self)
10331{
10332 if (PyUnicode_READY(self) == -1)
10333 return NULL;
10334 if (PyUnicode_IS_ASCII(self))
10335 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010336 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010337}
10338
10339
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010340/* Argument converter. Coerces to a single unicode character */
10341
10342static int
10343convert_uc(PyObject *obj, void *addr)
10344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010346 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010347
Benjamin Peterson14339b62009-01-31 16:36:08 +000010348 uniobj = PyUnicode_FromObject(obj);
10349 if (uniobj == NULL) {
10350 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 return 0;
10353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010355 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357 Py_DECREF(uniobj);
10358 return 0;
10359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 Py_DECREF(uniobj);
10362 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010363}
10364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010365PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010366 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010368Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010369done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
10371static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010372unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010374 Py_ssize_t marg, left;
10375 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 Py_UCS4 fillchar = ' ';
10377
Victor Stinnere9a29352011-10-01 02:14:59 +020010378 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
Benjamin Petersonbac79492012-01-14 13:34:47 -050010381 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 return NULL;
10383
Victor Stinnerc4b49542011-12-11 22:44:26 +010010384 if (PyUnicode_GET_LENGTH(self) >= width)
10385 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Victor Stinnerc4b49542011-12-11 22:44:26 +010010387 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 left = marg / 2 + (marg & width & 1);
10389
Victor Stinner9310abb2011-10-05 00:59:23 +020010390 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391}
10392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393/* This function assumes that str1 and str2 are readied by the caller. */
10394
Marc-André Lemburge5034372000-08-08 08:04:29 +000010395static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010396unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 int kind1, kind2;
10399 void *data1, *data2;
10400 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 kind1 = PyUnicode_KIND(str1);
10403 kind2 = PyUnicode_KIND(str2);
10404 data1 = PyUnicode_DATA(str1);
10405 data2 = PyUnicode_DATA(str2);
10406 len1 = PyUnicode_GET_LENGTH(str1);
10407 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 for (i = 0; i < len1 && i < len2; ++i) {
10410 Py_UCS4 c1, c2;
10411 c1 = PyUnicode_READ(kind1, data1, i);
10412 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010413
10414 if (c1 != c2)
10415 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010416 }
10417
10418 return (len1 < len2) ? -1 : (len1 != len2);
10419}
10420
Alexander Belopolsky40018472011-02-26 01:02:56 +000010421int
10422PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10425 if (PyUnicode_READY(left) == -1 ||
10426 PyUnicode_READY(right) == -1)
10427 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010428 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010430 PyErr_Format(PyExc_TypeError,
10431 "Can't compare %.100s and %.100s",
10432 left->ob_type->tp_name,
10433 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434 return -1;
10435}
10436
Martin v. Löwis5b222132007-06-10 09:51:05 +000010437int
10438PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 Py_ssize_t i;
10441 int kind;
10442 void *data;
10443 Py_UCS4 chr;
10444
Victor Stinner910337b2011-10-03 03:20:16 +020010445 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 if (PyUnicode_READY(uni) == -1)
10447 return -1;
10448 kind = PyUnicode_KIND(uni);
10449 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010450 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10452 if (chr != str[i])
10453 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010454 /* This check keeps Python strings that end in '\0' from comparing equal
10455 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010457 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010458 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010460 return 0;
10461}
10462
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010463
Benjamin Peterson29060642009-01-31 22:14:21 +000010464#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010465 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010466
Alexander Belopolsky40018472011-02-26 01:02:56 +000010467PyObject *
10468PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010469{
10470 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010472 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10473 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (PyUnicode_READY(left) == -1 ||
10475 PyUnicode_READY(right) == -1)
10476 return NULL;
10477 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10478 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010479 if (op == Py_EQ) {
10480 Py_INCREF(Py_False);
10481 return Py_False;
10482 }
10483 if (op == Py_NE) {
10484 Py_INCREF(Py_True);
10485 return Py_True;
10486 }
10487 }
10488 if (left == right)
10489 result = 0;
10490 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010491 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010492
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010493 /* Convert the return value to a Boolean */
10494 switch (op) {
10495 case Py_EQ:
10496 v = TEST_COND(result == 0);
10497 break;
10498 case Py_NE:
10499 v = TEST_COND(result != 0);
10500 break;
10501 case Py_LE:
10502 v = TEST_COND(result <= 0);
10503 break;
10504 case Py_GE:
10505 v = TEST_COND(result >= 0);
10506 break;
10507 case Py_LT:
10508 v = TEST_COND(result == -1);
10509 break;
10510 case Py_GT:
10511 v = TEST_COND(result == 1);
10512 break;
10513 default:
10514 PyErr_BadArgument();
10515 return NULL;
10516 }
10517 Py_INCREF(v);
10518 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010520
Brian Curtindfc80e32011-08-10 20:28:54 -050010521 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010522}
10523
Alexander Belopolsky40018472011-02-26 01:02:56 +000010524int
10525PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010526{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010527 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 int kind1, kind2, kind;
10529 void *buf1, *buf2;
10530 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010531 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010532
10533 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010534 sub = PyUnicode_FromObject(element);
10535 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 PyErr_Format(PyExc_TypeError,
10537 "'in <string>' requires string as left operand, not %s",
10538 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010539 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010540 }
10541
Thomas Wouters477c8d52006-05-27 19:21:47 +000010542 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010543 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 Py_DECREF(sub);
10545 return -1;
10546 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010547 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10548 Py_DECREF(sub);
10549 Py_DECREF(str);
10550 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 kind1 = PyUnicode_KIND(str);
10553 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010554 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 buf1 = PyUnicode_DATA(str);
10556 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010557 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010558 if (kind2 > kind) {
10559 Py_DECREF(sub);
10560 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010561 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010562 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010563 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (!buf2) {
10566 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010567 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 return -1;
10569 }
10570 len1 = PyUnicode_GET_LENGTH(str);
10571 len2 = PyUnicode_GET_LENGTH(sub);
10572
Benjamin Petersonead6b532011-12-20 17:23:42 -060010573 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 case PyUnicode_1BYTE_KIND:
10575 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10576 break;
10577 case PyUnicode_2BYTE_KIND:
10578 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10579 break;
10580 case PyUnicode_4BYTE_KIND:
10581 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10582 break;
10583 default:
10584 result = -1;
10585 assert(0);
10586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010587
10588 Py_DECREF(str);
10589 Py_DECREF(sub);
10590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (kind2 != kind)
10592 PyMem_Free(buf2);
10593
Guido van Rossum403d68b2000-03-13 15:55:09 +000010594 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010595}
10596
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597/* Concat to string or Unicode object giving a new Unicode object. */
10598
Alexander Belopolsky40018472011-02-26 01:02:56 +000010599PyObject *
10600PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010603 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010604 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
10606 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010609 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010612 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
10614 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010615 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010619 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 }
10623
Victor Stinner488fa492011-12-12 00:01:39 +010010624 u_len = PyUnicode_GET_LENGTH(u);
10625 v_len = PyUnicode_GET_LENGTH(v);
10626 if (u_len > PY_SSIZE_T_MAX - v_len) {
10627 PyErr_SetString(PyExc_OverflowError,
10628 "strings are too large to concat");
10629 goto onError;
10630 }
10631 new_len = u_len + v_len;
10632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010634 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010635 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010638 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010641 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10642 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 Py_DECREF(u);
10644 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010645 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 Py_XDECREF(u);
10650 Py_XDECREF(v);
10651 return NULL;
10652}
10653
Walter Dörwald1ab83302007-05-18 17:15:44 +000010654void
Victor Stinner23e56682011-10-03 03:54:37 +020010655PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010656{
Victor Stinner23e56682011-10-03 03:54:37 +020010657 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010658 Py_UCS4 maxchar, maxchar2;
10659 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010660
10661 if (p_left == NULL) {
10662 if (!PyErr_Occurred())
10663 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010664 return;
10665 }
Victor Stinner23e56682011-10-03 03:54:37 +020010666 left = *p_left;
10667 if (right == NULL || !PyUnicode_Check(left)) {
10668 if (!PyErr_Occurred())
10669 PyErr_BadInternalCall();
10670 goto error;
10671 }
10672
Benjamin Petersonbac79492012-01-14 13:34:47 -050010673 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010674 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010675 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010676 goto error;
10677
Victor Stinner488fa492011-12-12 00:01:39 +010010678 /* Shortcuts */
10679 if (left == unicode_empty) {
10680 Py_DECREF(left);
10681 Py_INCREF(right);
10682 *p_left = right;
10683 return;
10684 }
10685 if (right == unicode_empty)
10686 return;
10687
10688 left_len = PyUnicode_GET_LENGTH(left);
10689 right_len = PyUnicode_GET_LENGTH(right);
10690 if (left_len > PY_SSIZE_T_MAX - right_len) {
10691 PyErr_SetString(PyExc_OverflowError,
10692 "strings are too large to concat");
10693 goto error;
10694 }
10695 new_len = left_len + right_len;
10696
10697 if (unicode_modifiable(left)
10698 && PyUnicode_CheckExact(right)
10699 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010700 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10701 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010702 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010703 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010704 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10705 {
10706 /* append inplace */
10707 if (unicode_resize(p_left, new_len) != 0) {
10708 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10709 * deallocated so it cannot be put back into
10710 * 'variable'. The MemoryError is raised when there
10711 * is no value in 'variable', which might (very
10712 * remotely) be a cause of incompatibilities.
10713 */
10714 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010715 }
Victor Stinner488fa492011-12-12 00:01:39 +010010716 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010717 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010718 }
Victor Stinner488fa492011-12-12 00:01:39 +010010719 else {
10720 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10721 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010722 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010723
Victor Stinner488fa492011-12-12 00:01:39 +010010724 /* Concat the two Unicode strings */
10725 res = PyUnicode_New(new_len, maxchar);
10726 if (res == NULL)
10727 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010728 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10729 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010730 Py_DECREF(left);
10731 *p_left = res;
10732 }
10733 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010734 return;
10735
10736error:
Victor Stinner488fa492011-12-12 00:01:39 +010010737 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010738}
10739
10740void
10741PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 PyUnicode_Append(pleft, right);
10744 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010745}
10746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010747PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010751string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010755unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010757 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010758 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010759 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 int kind1, kind2, kind;
10762 void *buf1, *buf2;
10763 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
Jesus Ceaac451502011-04-20 17:09:23 +020010765 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10766 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 kind1 = PyUnicode_KIND(self);
10770 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010771 if (kind2 > kind1)
10772 return PyLong_FromLong(0);
10773 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 buf1 = PyUnicode_DATA(self);
10775 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010777 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 if (!buf2) {
10779 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 return NULL;
10781 }
10782 len1 = PyUnicode_GET_LENGTH(self);
10783 len2 = PyUnicode_GET_LENGTH(substring);
10784
10785 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010786 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 case PyUnicode_1BYTE_KIND:
10788 iresult = ucs1lib_count(
10789 ((Py_UCS1*)buf1) + start, end - start,
10790 buf2, len2, PY_SSIZE_T_MAX
10791 );
10792 break;
10793 case PyUnicode_2BYTE_KIND:
10794 iresult = ucs2lib_count(
10795 ((Py_UCS2*)buf1) + start, end - start,
10796 buf2, len2, PY_SSIZE_T_MAX
10797 );
10798 break;
10799 case PyUnicode_4BYTE_KIND:
10800 iresult = ucs4lib_count(
10801 ((Py_UCS4*)buf1) + start, end - start,
10802 buf2, len2, PY_SSIZE_T_MAX
10803 );
10804 break;
10805 default:
10806 assert(0); iresult = 0;
10807 }
10808
10809 result = PyLong_FromSsize_t(iresult);
10810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (kind2 != kind)
10812 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813
10814 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010815
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816 return result;
10817}
10818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010819PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010820 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010822Encode S using the codec registered for encoding. Default encoding\n\
10823is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010824handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010825a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10826'xmlcharrefreplace' as well as any other name registered with\n\
10827codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
10829static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010830unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010832 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 char *encoding = NULL;
10834 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010835
Benjamin Peterson308d6372009-09-18 21:42:35 +000010836 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10837 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010839 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010840}
10841
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010842PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010843 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844\n\
10845Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
10848static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010849unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010851 Py_ssize_t i, j, line_pos, src_len, incr;
10852 Py_UCS4 ch;
10853 PyObject *u;
10854 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010856 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010857 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858
10859 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861
Antoine Pitrou22425222011-10-04 19:10:51 +020010862 if (PyUnicode_READY(self) == -1)
10863 return NULL;
10864
Thomas Wouters7e474022000-07-16 12:04:32 +000010865 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010866 src_len = PyUnicode_GET_LENGTH(self);
10867 i = j = line_pos = 0;
10868 kind = PyUnicode_KIND(self);
10869 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010870 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010871 for (; i < src_len; i++) {
10872 ch = PyUnicode_READ(kind, src_data, i);
10873 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010874 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010876 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010878 goto overflow;
10879 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010880 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010881 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 goto overflow;
10886 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010888 if (ch == '\n' || ch == '\r')
10889 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010892 if (!found)
10893 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010894
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010896 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 if (!u)
10898 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900
Antoine Pitroue71d5742011-10-04 15:55:09 +020010901 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 for (; i < src_len; i++) {
10904 ch = PyUnicode_READ(kind, src_data, i);
10905 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 incr = tabsize - (line_pos % tabsize);
10908 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010909 FILL(kind, dest_data, ' ', j, incr);
10910 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010912 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010914 line_pos++;
10915 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010916 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010917 if (ch == '\n' || ch == '\r')
10918 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 }
10921 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010922 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010923
Antoine Pitroue71d5742011-10-04 15:55:09 +020010924 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010925 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927}
10928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010929PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931\n\
10932Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010933such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934arguments start and end are interpreted as in slice notation.\n\
10935\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010936Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
10938static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010941 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010942 Py_ssize_t start;
10943 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
Jesus Ceaac451502011-04-20 17:09:23 +020010946 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10947 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (PyUnicode_READY(self) == -1)
10951 return NULL;
10952 if (PyUnicode_READY(substring) == -1)
10953 return NULL;
10954
Victor Stinner7931d9a2011-11-04 00:22:48 +010010955 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
10957 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (result == -2)
10960 return NULL;
10961
Christian Heimes217cfd12007-12-02 14:31:20 +000010962 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963}
10964
10965static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010966unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010968 void *data;
10969 enum PyUnicode_Kind kind;
10970 Py_UCS4 ch;
10971 PyObject *res;
10972
10973 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10974 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010976 }
10977 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10978 PyErr_SetString(PyExc_IndexError, "string index out of range");
10979 return NULL;
10980 }
10981 kind = PyUnicode_KIND(self);
10982 data = PyUnicode_DATA(self);
10983 ch = PyUnicode_READ(kind, data, index);
10984 if (ch < 256)
10985 return get_latin1_char(ch);
10986
10987 res = PyUnicode_New(1, ch);
10988 if (res == NULL)
10989 return NULL;
10990 kind = PyUnicode_KIND(res);
10991 data = PyUnicode_DATA(res);
10992 PyUnicode_WRITE(kind, data, 0, ch);
10993 assert(_PyUnicode_CheckConsistency(res, 1));
10994 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995}
10996
Guido van Rossumc2504932007-09-18 19:42:40 +000010997/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010998 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010999static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011000unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001{
Guido van Rossumc2504932007-09-18 19:42:40 +000011002 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011003 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011004
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011005#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011006 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011007#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (_PyUnicode_HASH(self) != -1)
11009 return _PyUnicode_HASH(self);
11010 if (PyUnicode_READY(self) == -1)
11011 return -1;
11012 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011013 /*
11014 We make the hash of the empty string be 0, rather than using
11015 (prefix ^ suffix), since this slightly obfuscates the hash secret
11016 */
11017 if (len == 0) {
11018 _PyUnicode_HASH(self) = 0;
11019 return 0;
11020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021
11022 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011023#define HASH(P) \
11024 x ^= (Py_uhash_t) *P << 7; \
11025 while (--len >= 0) \
11026 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027
Georg Brandl2fb477c2012-02-21 00:33:36 +010011028 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 switch (PyUnicode_KIND(self)) {
11030 case PyUnicode_1BYTE_KIND: {
11031 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11032 HASH(c);
11033 break;
11034 }
11035 case PyUnicode_2BYTE_KIND: {
11036 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11037 HASH(s);
11038 break;
11039 }
11040 default: {
11041 Py_UCS4 *l;
11042 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11043 "Impossible switch case in unicode_hash");
11044 l = PyUnicode_4BYTE_DATA(self);
11045 HASH(l);
11046 break;
11047 }
11048 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011049 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11050 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051
Guido van Rossumc2504932007-09-18 19:42:40 +000011052 if (x == -1)
11053 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011055 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011062Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
11064static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011067 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011068 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011069 Py_ssize_t start;
11070 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
Jesus Ceaac451502011-04-20 17:09:23 +020011072 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11073 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078 if (PyUnicode_READY(substring) == -1)
11079 return NULL;
11080
Victor Stinner7931d9a2011-11-04 00:22:48 +010011081 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
11083 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 if (result == -2)
11086 return NULL;
11087
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 if (result < 0) {
11089 PyErr_SetString(PyExc_ValueError, "substring not found");
11090 return NULL;
11091 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
Christian Heimes217cfd12007-12-02 14:31:20 +000011093 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094}
11095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011096PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011097 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011099Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011100at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101
11102static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011103unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 Py_ssize_t i, length;
11106 int kind;
11107 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108 int cased;
11109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (PyUnicode_READY(self) == -1)
11111 return NULL;
11112 length = PyUnicode_GET_LENGTH(self);
11113 kind = PyUnicode_KIND(self);
11114 data = PyUnicode_DATA(self);
11115
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 if (length == 1)
11118 return PyBool_FromLong(
11119 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011121 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011123 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011124
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 for (i = 0; i < length; i++) {
11127 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011128
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11130 return PyBool_FromLong(0);
11131 else if (!cased && Py_UNICODE_ISLOWER(ch))
11132 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011134 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135}
11136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011137PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011140Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142
11143static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011144unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 Py_ssize_t i, length;
11147 int kind;
11148 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 int cased;
11150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (PyUnicode_READY(self) == -1)
11152 return NULL;
11153 length = PyUnicode_GET_LENGTH(self);
11154 kind = PyUnicode_KIND(self);
11155 data = PyUnicode_DATA(self);
11156
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (length == 1)
11159 return PyBool_FromLong(
11160 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011162 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011165
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 for (i = 0; i < length; i++) {
11168 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011169
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11171 return PyBool_FromLong(0);
11172 else if (!cased && Py_UNICODE_ISUPPER(ch))
11173 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011175 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176}
11177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011178PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011181Return True if S is a titlecased string and there is at least one\n\
11182character in S, i.e. upper- and titlecase characters may only\n\
11183follow uncased characters and lowercase characters only cased ones.\n\
11184Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
11186static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011187unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 Py_ssize_t i, length;
11190 int kind;
11191 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192 int cased, previous_is_cased;
11193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (PyUnicode_READY(self) == -1)
11195 return NULL;
11196 length = PyUnicode_GET_LENGTH(self);
11197 kind = PyUnicode_KIND(self);
11198 data = PyUnicode_DATA(self);
11199
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (length == 1) {
11202 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11203 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11204 (Py_UNICODE_ISUPPER(ch) != 0));
11205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011207 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011210
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 cased = 0;
11212 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 for (i = 0; i < length; i++) {
11214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011215
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11217 if (previous_is_cased)
11218 return PyBool_FromLong(0);
11219 previous_is_cased = 1;
11220 cased = 1;
11221 }
11222 else if (Py_UNICODE_ISLOWER(ch)) {
11223 if (!previous_is_cased)
11224 return PyBool_FromLong(0);
11225 previous_is_cased = 1;
11226 cased = 1;
11227 }
11228 else
11229 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011231 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232}
11233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011234PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011237Return True if all characters in S are whitespace\n\
11238and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
11240static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011241unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 Py_ssize_t i, length;
11244 int kind;
11245 void *data;
11246
11247 if (PyUnicode_READY(self) == -1)
11248 return NULL;
11249 length = PyUnicode_GET_LENGTH(self);
11250 kind = PyUnicode_KIND(self);
11251 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 if (length == 1)
11255 return PyBool_FromLong(
11256 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011258 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 for (i = 0; i < length; i++) {
11263 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011264 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011267 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268}
11269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011270PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011272\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011273Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011274and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011275
11276static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011277unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 Py_ssize_t i, length;
11280 int kind;
11281 void *data;
11282
11283 if (PyUnicode_READY(self) == -1)
11284 return NULL;
11285 length = PyUnicode_GET_LENGTH(self);
11286 kind = PyUnicode_KIND(self);
11287 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011288
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011289 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (length == 1)
11291 return PyBool_FromLong(
11292 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011293
11294 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 for (i = 0; i < length; i++) {
11299 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011301 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011302 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011307\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011308Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011310
11311static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 int kind;
11315 void *data;
11316 Py_ssize_t len, i;
11317
11318 if (PyUnicode_READY(self) == -1)
11319 return NULL;
11320
11321 kind = PyUnicode_KIND(self);
11322 data = PyUnicode_DATA(self);
11323 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011324
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (len == 1) {
11327 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11328 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11329 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
11331 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 for (i = 0; i < len; i++) {
11336 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011337 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011339 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011340 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011341}
11342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011343PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011347False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
11349static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011350unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 Py_ssize_t i, length;
11353 int kind;
11354 void *data;
11355
11356 if (PyUnicode_READY(self) == -1)
11357 return NULL;
11358 length = PyUnicode_GET_LENGTH(self);
11359 kind = PyUnicode_KIND(self);
11360 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (length == 1)
11364 return PyBool_FromLong(
11365 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011367 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 for (i = 0; i < length; i++) {
11372 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011375 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376}
11377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011378PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011381Return True if all characters in S are digits\n\
11382and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383
11384static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011385unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 Py_ssize_t i, length;
11388 int kind;
11389 void *data;
11390
11391 if (PyUnicode_READY(self) == -1)
11392 return NULL;
11393 length = PyUnicode_GET_LENGTH(self);
11394 kind = PyUnicode_KIND(self);
11395 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 if (length == 1) {
11399 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11400 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011403 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 for (i = 0; i < length; i++) {
11408 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011411 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011421unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 Py_ssize_t i, length;
11424 int kind;
11425 void *data;
11426
11427 if (PyUnicode_READY(self) == -1)
11428 return NULL;
11429 length = PyUnicode_GET_LENGTH(self);
11430 kind = PyUnicode_KIND(self);
11431 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (length == 1)
11435 return PyBool_FromLong(
11436 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011438 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 for (i = 0; i < length; i++) {
11443 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011446 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447}
11448
Martin v. Löwis47383402007-08-15 07:32:56 +000011449int
11450PyUnicode_IsIdentifier(PyObject *self)
11451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 int kind;
11453 void *data;
11454 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011455 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (PyUnicode_READY(self) == -1) {
11458 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 }
11461
11462 /* Special case for empty strings */
11463 if (PyUnicode_GET_LENGTH(self) == 0)
11464 return 0;
11465 kind = PyUnicode_KIND(self);
11466 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011467
11468 /* PEP 3131 says that the first character must be in
11469 XID_Start and subsequent characters in XID_Continue,
11470 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011471 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011472 letters, digits, underscore). However, given the current
11473 definition of XID_Start and XID_Continue, it is sufficient
11474 to check just for these, except that _ must be allowed
11475 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011477 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011478 return 0;
11479
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011480 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011483 return 1;
11484}
11485
11486PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011488\n\
11489Return True if S is a valid identifier according\n\
11490to the language definition.");
11491
11492static PyObject*
11493unicode_isidentifier(PyObject *self)
11494{
11495 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11496}
11497
Georg Brandl559e5d72008-06-11 18:37:52 +000011498PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011500\n\
11501Return True if all characters in S are considered\n\
11502printable in repr() or S is empty, False otherwise.");
11503
11504static PyObject*
11505unicode_isprintable(PyObject *self)
11506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 Py_ssize_t i, length;
11508 int kind;
11509 void *data;
11510
11511 if (PyUnicode_READY(self) == -1)
11512 return NULL;
11513 length = PyUnicode_GET_LENGTH(self);
11514 kind = PyUnicode_KIND(self);
11515 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011516
11517 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (length == 1)
11519 return PyBool_FromLong(
11520 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 for (i = 0; i < length; i++) {
11523 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011524 Py_RETURN_FALSE;
11525 }
11526 }
11527 Py_RETURN_TRUE;
11528}
11529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011530PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011531 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532\n\
11533Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011534iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535
11536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011537unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011539 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540}
11541
Martin v. Löwis18e16552006-02-15 17:27:45 +000011542static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011543unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (PyUnicode_READY(self) == -1)
11546 return -1;
11547 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548}
11549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011550PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011553Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011554done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
11556static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011557unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011559 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 Py_UCS4 fillchar = ' ';
11561
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011562 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 return NULL;
11564
Benjamin Petersonbac79492012-01-14 13:34:47 -050011565 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
Victor Stinnerc4b49542011-12-11 22:44:26 +010011568 if (PyUnicode_GET_LENGTH(self) >= width)
11569 return unicode_result_unchanged(self);
11570
11571 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572}
11573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011574PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011577Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578
11579static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011580unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011582 if (PyUnicode_READY(self) == -1)
11583 return NULL;
11584 if (PyUnicode_IS_ASCII(self))
11585 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011586 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587}
11588
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above.
   Both the strings and the table slots are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of
   the matching format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11597
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011598/* externally visible for str.strip(unicode) */
11599PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011600_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 void *data;
11603 int kind;
11604 Py_ssize_t i, j, len;
11605 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11608 return NULL;
11609
11610 kind = PyUnicode_KIND(self);
11611 data = PyUnicode_DATA(self);
11612 len = PyUnicode_GET_LENGTH(self);
11613 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11614 PyUnicode_DATA(sepobj),
11615 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011616
Benjamin Peterson14339b62009-01-31 16:36:08 +000011617 i = 0;
11618 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 while (i < len &&
11620 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 i++;
11622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011624
Benjamin Peterson14339b62009-01-31 16:36:08 +000011625 j = len;
11626 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 do {
11628 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 } while (j >= i &&
11630 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011632 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633
Victor Stinner7931d9a2011-11-04 00:22:48 +010011634 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635}
11636
11637PyObject*
11638PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11639{
11640 unsigned char *data;
11641 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011642 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643
Victor Stinnerde636f32011-10-01 03:55:54 +020011644 if (PyUnicode_READY(self) == -1)
11645 return NULL;
11646
Victor Stinner684d5fd2012-05-03 02:32:34 +020011647 length = PyUnicode_GET_LENGTH(self);
11648 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011649
Victor Stinner684d5fd2012-05-03 02:32:34 +020011650 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011651 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652
Victor Stinnerde636f32011-10-01 03:55:54 +020011653 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011654 PyErr_SetString(PyExc_IndexError, "string index out of range");
11655 return NULL;
11656 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011657 if (start >= length || end < start) {
Victor Stinner3a7f7972012-05-03 03:36:40 +020011658 Py_INCREF(unicode_empty);
11659 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011660 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011661
Victor Stinner684d5fd2012-05-03 02:32:34 +020011662 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011663 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011664 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011665 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011666 }
11667 else {
11668 kind = PyUnicode_KIND(self);
11669 data = PyUnicode_1BYTE_DATA(self);
11670 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011671 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011672 length);
11673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
11676static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011677do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 int kind;
11680 void *data;
11681 Py_ssize_t len, i, j;
11682
11683 if (PyUnicode_READY(self) == -1)
11684 return NULL;
11685
11686 kind = PyUnicode_KIND(self);
11687 data = PyUnicode_DATA(self);
11688 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011689
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 i = 0;
11691 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011693 i++;
11694 }
11695 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 j = len;
11698 if (striptype != LEFTSTRIP) {
11699 do {
11700 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 j++;
11703 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
Victor Stinner7931d9a2011-11-04 00:22:48 +010011705 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706}
11707
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708
11709static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011710do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011711{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713
Benjamin Peterson14339b62009-01-31 16:36:08 +000011714 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11715 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
Benjamin Peterson14339b62009-01-31 16:36:08 +000011717 if (sep != NULL && sep != Py_None) {
11718 if (PyUnicode_Check(sep))
11719 return _PyUnicode_XStrip(self, striptype, sep);
11720 else {
11721 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "%s arg must be None or str",
11723 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 return NULL;
11725 }
11726 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011727
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729}
11730
11731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011732PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011733 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734\n\
11735Return a copy of the string S with leading and trailing\n\
11736whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011737If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738
11739static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011740unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011742 if (PyTuple_GET_SIZE(args) == 0)
11743 return do_strip(self, BOTHSTRIP); /* Common case */
11744 else
11745 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746}
11747
11748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011749PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011751\n\
11752Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011753If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011754
11755static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011756unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011758 if (PyTuple_GET_SIZE(args) == 0)
11759 return do_strip(self, LEFTSTRIP); /* Common case */
11760 else
11761 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762}
11763
11764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011767\n\
11768Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011769If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770
11771static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011773{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011774 if (PyTuple_GET_SIZE(args) == 0)
11775 return do_strip(self, RIGHTSTRIP); /* Common case */
11776 else
11777 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011778}
11779
11780
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011782unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011784 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
Georg Brandl222de0f2009-04-12 12:01:50 +000011787 if (len < 1) {
11788 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011789 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
Victor Stinnerc4b49542011-12-11 22:44:26 +010011792 /* no repeat, return original string */
11793 if (len == 1)
11794 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011795
Benjamin Petersonbac79492012-01-14 13:34:47 -050011796 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 return NULL;
11798
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011799 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011800 PyErr_SetString(PyExc_OverflowError,
11801 "repeated string is too long");
11802 return NULL;
11803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011805
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011806 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 if (!u)
11808 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011809 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 if (PyUnicode_GET_LENGTH(str) == 1) {
11812 const int kind = PyUnicode_KIND(str);
11813 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011814 if (kind == PyUnicode_1BYTE_KIND) {
11815 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011816 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011817 }
11818 else if (kind == PyUnicode_2BYTE_KIND) {
11819 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011820 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011821 ucs2[n] = fill_char;
11822 } else {
11823 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11824 assert(kind == PyUnicode_4BYTE_KIND);
11825 for (n = 0; n < len; ++n)
11826 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 }
11829 else {
11830 /* number of characters copied this far */
11831 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011832 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 char *to = (char *) PyUnicode_DATA(u);
11834 Py_MEMCPY(to, PyUnicode_DATA(str),
11835 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 n = (done <= nchars-done) ? done : nchars-done;
11838 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011839 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841 }
11842
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011843 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011844 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845}
11846
Alexander Belopolsky40018472011-02-26 01:02:56 +000011847PyObject *
11848PyUnicode_Replace(PyObject *obj,
11849 PyObject *subobj,
11850 PyObject *replobj,
11851 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852{
11853 PyObject *self;
11854 PyObject *str1;
11855 PyObject *str2;
11856 PyObject *result;
11857
11858 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011859 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011862 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 Py_DECREF(self);
11864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 }
11866 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011867 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 Py_DECREF(self);
11869 Py_DECREF(str1);
11870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011872 if (PyUnicode_READY(self) == -1 ||
11873 PyUnicode_READY(str1) == -1 ||
11874 PyUnicode_READY(str2) == -1)
11875 result = NULL;
11876 else
11877 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 Py_DECREF(self);
11879 Py_DECREF(str1);
11880 Py_DECREF(str2);
11881 return result;
11882}
11883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011884PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011885 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886\n\
11887Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011888old replaced by new. If the optional argument count is\n\
11889given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
11891static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 PyObject *str1;
11895 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011896 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 PyObject *result;
11898
Martin v. Löwis18e16552006-02-15 17:27:45 +000011899 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011901 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011904 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 return NULL;
11906 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011907 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 Py_DECREF(str1);
11909 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011910 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011911 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11912 result = NULL;
11913 else
11914 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
11916 Py_DECREF(str1);
11917 Py_DECREF(str2);
11918 return result;
11919}
11920
Alexander Belopolsky40018472011-02-26 01:02:56 +000011921static PyObject *
11922unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011924 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 Py_ssize_t isize;
11926 Py_ssize_t osize, squote, dquote, i, o;
11927 Py_UCS4 max, quote;
11928 int ikind, okind;
11929 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011932 return NULL;
11933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 isize = PyUnicode_GET_LENGTH(unicode);
11935 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 /* Compute length of output, quote characters, and
11938 maximum character */
11939 osize = 2; /* quotes */
11940 max = 127;
11941 squote = dquote = 0;
11942 ikind = PyUnicode_KIND(unicode);
11943 for (i = 0; i < isize; i++) {
11944 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11945 switch (ch) {
11946 case '\'': squote++; osize++; break;
11947 case '"': dquote++; osize++; break;
11948 case '\\': case '\t': case '\r': case '\n':
11949 osize += 2; break;
11950 default:
11951 /* Fast-path ASCII */
11952 if (ch < ' ' || ch == 0x7f)
11953 osize += 4; /* \xHH */
11954 else if (ch < 0x7f)
11955 osize++;
11956 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11957 osize++;
11958 max = ch > max ? ch : max;
11959 }
11960 else if (ch < 0x100)
11961 osize += 4; /* \xHH */
11962 else if (ch < 0x10000)
11963 osize += 6; /* \uHHHH */
11964 else
11965 osize += 10; /* \uHHHHHHHH */
11966 }
11967 }
11968
11969 quote = '\'';
11970 if (squote) {
11971 if (dquote)
11972 /* Both squote and dquote present. Use squote,
11973 and escape them */
11974 osize += squote;
11975 else
11976 quote = '"';
11977 }
11978
11979 repr = PyUnicode_New(osize, max);
11980 if (repr == NULL)
11981 return NULL;
11982 okind = PyUnicode_KIND(repr);
11983 odata = PyUnicode_DATA(repr);
11984
11985 PyUnicode_WRITE(okind, odata, 0, quote);
11986 PyUnicode_WRITE(okind, odata, osize-1, quote);
11987
11988 for (i = 0, o = 1; i < isize; i++) {
11989 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011990
11991 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 if ((ch == quote) || (ch == '\\')) {
11993 PyUnicode_WRITE(okind, odata, o++, '\\');
11994 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011995 continue;
11996 }
11997
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011999 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 PyUnicode_WRITE(okind, odata, o++, '\\');
12001 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012002 }
12003 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 PyUnicode_WRITE(okind, odata, o++, '\\');
12005 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012006 }
12007 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 PyUnicode_WRITE(okind, odata, o++, '\\');
12009 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 }
12011
12012 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012013 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 PyUnicode_WRITE(okind, odata, o++, '\\');
12015 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012016 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12017 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012018 }
12019
Georg Brandl559e5d72008-06-11 18:37:52 +000012020 /* Copy ASCII characters as-is */
12021 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012023 }
12024
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012026 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012027 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012028 (categories Z* and C* except ASCII space)
12029 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012031 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012032 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012037 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012038 /* Map 16-bit characters to '\uxxxx' */
12039 else if (ch <= 0xffff) {
12040 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012041 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12042 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012045 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012046 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012047 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012048 PyUnicode_WRITE(okind, odata, o++, 'U');
12049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012053 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012057 }
12058 }
12059 /* Copy characters as-is */
12060 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012062 }
12063 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012066 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012067 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068}
12069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012070PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072\n\
12073Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012074such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075arguments start and end are interpreted as in slice notation.\n\
12076\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012077Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
12079static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012082 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012083 Py_ssize_t start;
12084 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012085 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Jesus Ceaac451502011-04-20 17:09:23 +020012087 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12088 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 if (PyUnicode_READY(self) == -1)
12092 return NULL;
12093 if (PyUnicode_READY(substring) == -1)
12094 return NULL;
12095
Victor Stinner7931d9a2011-11-04 00:22:48 +010012096 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
12098 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (result == -2)
12101 return NULL;
12102
Christian Heimes217cfd12007-12-02 14:31:20 +000012103 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104}
12105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012106PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012109Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
12111static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012114 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012115 Py_ssize_t start;
12116 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012117 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
Jesus Ceaac451502011-04-20 17:09:23 +020012119 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12120 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 if (PyUnicode_READY(self) == -1)
12124 return NULL;
12125 if (PyUnicode_READY(substring) == -1)
12126 return NULL;
12127
Victor Stinner7931d9a2011-11-04 00:22:48 +010012128 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
12130 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (result == -2)
12133 return NULL;
12134
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 if (result < 0) {
12136 PyErr_SetString(PyExc_ValueError, "substring not found");
12137 return NULL;
12138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139
Christian Heimes217cfd12007-12-02 14:31:20 +000012140 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141}
12142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012143PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012146Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012147done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
12149static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012150unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012152 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 Py_UCS4 fillchar = ' ';
12154
Victor Stinnere9a29352011-10-01 02:14:59 +020012155 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012157
Benjamin Petersonbac79492012-01-14 13:34:47 -050012158 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159 return NULL;
12160
Victor Stinnerc4b49542011-12-11 22:44:26 +010012161 if (PyUnicode_GET_LENGTH(self) >= width)
12162 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Victor Stinnerc4b49542011-12-11 22:44:26 +010012164 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165}
12166
Alexander Belopolsky40018472011-02-26 01:02:56 +000012167PyObject *
12168PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169{
12170 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012171
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172 s = PyUnicode_FromObject(s);
12173 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012174 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 if (sep != NULL) {
12176 sep = PyUnicode_FromObject(sep);
12177 if (sep == NULL) {
12178 Py_DECREF(s);
12179 return NULL;
12180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181 }
12182
Victor Stinner9310abb2011-10-05 00:59:23 +020012183 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
12185 Py_DECREF(s);
12186 Py_XDECREF(sep);
12187 return result;
12188}
12189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012190PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012191 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192\n\
12193Return a list of the words in S, using sep as the\n\
12194delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012195splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012196whitespace string is a separator and empty strings are\n\
12197removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
12199static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012200unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012202 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012204 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012206 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12207 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 return NULL;
12209
12210 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012211 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012213 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012215 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216}
12217
Thomas Wouters477c8d52006-05-27 19:21:47 +000012218PyObject *
12219PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12220{
12221 PyObject* str_obj;
12222 PyObject* sep_obj;
12223 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 int kind1, kind2, kind;
12225 void *buf1 = NULL, *buf2 = NULL;
12226 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012227
12228 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012229 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012231 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012232 if (!sep_obj) {
12233 Py_DECREF(str_obj);
12234 return NULL;
12235 }
12236 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12237 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012238 Py_DECREF(str_obj);
12239 return NULL;
12240 }
12241
Victor Stinner14f8f022011-10-05 20:58:25 +020012242 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012244 kind = Py_MAX(kind1, kind2);
12245 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012247 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 if (!buf1)
12249 goto onError;
12250 buf2 = PyUnicode_DATA(sep_obj);
12251 if (kind2 != kind)
12252 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12253 if (!buf2)
12254 goto onError;
12255 len1 = PyUnicode_GET_LENGTH(str_obj);
12256 len2 = PyUnicode_GET_LENGTH(sep_obj);
12257
Benjamin Petersonead6b532011-12-20 17:23:42 -060012258 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012260 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12261 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12262 else
12263 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 break;
12265 case PyUnicode_2BYTE_KIND:
12266 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12267 break;
12268 case PyUnicode_4BYTE_KIND:
12269 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12270 break;
12271 default:
12272 assert(0);
12273 out = 0;
12274 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012275
12276 Py_DECREF(sep_obj);
12277 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 if (kind1 != kind)
12279 PyMem_Free(buf1);
12280 if (kind2 != kind)
12281 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012282
12283 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 onError:
12285 Py_DECREF(sep_obj);
12286 Py_DECREF(str_obj);
12287 if (kind1 != kind && buf1)
12288 PyMem_Free(buf1);
12289 if (kind2 != kind && buf2)
12290 PyMem_Free(buf2);
12291 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012292}
12293
12294
12295PyObject *
12296PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12297{
12298 PyObject* str_obj;
12299 PyObject* sep_obj;
12300 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 int kind1, kind2, kind;
12302 void *buf1 = NULL, *buf2 = NULL;
12303 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304
12305 str_obj = PyUnicode_FromObject(str_in);
12306 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308 sep_obj = PyUnicode_FromObject(sep_in);
12309 if (!sep_obj) {
12310 Py_DECREF(str_obj);
12311 return NULL;
12312 }
12313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 kind1 = PyUnicode_KIND(str_in);
12315 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012316 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 buf1 = PyUnicode_DATA(str_in);
12318 if (kind1 != kind)
12319 buf1 = _PyUnicode_AsKind(str_in, kind);
12320 if (!buf1)
12321 goto onError;
12322 buf2 = PyUnicode_DATA(sep_obj);
12323 if (kind2 != kind)
12324 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12325 if (!buf2)
12326 goto onError;
12327 len1 = PyUnicode_GET_LENGTH(str_obj);
12328 len2 = PyUnicode_GET_LENGTH(sep_obj);
12329
Benjamin Petersonead6b532011-12-20 17:23:42 -060012330 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012332 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12333 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12334 else
12335 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 break;
12337 case PyUnicode_2BYTE_KIND:
12338 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12339 break;
12340 case PyUnicode_4BYTE_KIND:
12341 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12342 break;
12343 default:
12344 assert(0);
12345 out = 0;
12346 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012347
12348 Py_DECREF(sep_obj);
12349 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (kind1 != kind)
12351 PyMem_Free(buf1);
12352 if (kind2 != kind)
12353 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012354
12355 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 onError:
12357 Py_DECREF(sep_obj);
12358 Py_DECREF(str_obj);
12359 if (kind1 != kind && buf1)
12360 PyMem_Free(buf1);
12361 if (kind2 != kind && buf2)
12362 PyMem_Free(buf2);
12363 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012364}
12365
12366PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012368\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012369Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012370the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012371found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372
12373static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012374unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375{
Victor Stinner9310abb2011-10-05 00:59:23 +020012376 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012377}
12378
12379PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012380 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012381\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012382Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012384separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012385
12386static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012387unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012388{
Victor Stinner9310abb2011-10-05 00:59:23 +020012389 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390}
12391
Alexander Belopolsky40018472011-02-26 01:02:56 +000012392PyObject *
12393PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012394{
12395 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012396
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012397 s = PyUnicode_FromObject(s);
12398 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 if (sep != NULL) {
12401 sep = PyUnicode_FromObject(sep);
12402 if (sep == NULL) {
12403 Py_DECREF(s);
12404 return NULL;
12405 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012406 }
12407
Victor Stinner9310abb2011-10-05 00:59:23 +020012408 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012409
12410 Py_DECREF(s);
12411 Py_XDECREF(sep);
12412 return result;
12413}
12414
12415PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012416 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012417\n\
12418Return a list of the words in S, using sep as the\n\
12419delimiter string, starting at the end of the string and\n\
12420working to the front. If maxsplit is given, at most maxsplit\n\
12421splits are done. If sep is not specified, any whitespace string\n\
12422is a separator.");
12423
12424static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012425unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012426{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012427 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012428 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012429 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012430
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012431 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12432 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012433 return NULL;
12434
12435 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012438 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012439 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012440 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012441}
12442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012443PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445\n\
12446Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012447Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012448is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449
12450static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012451unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012453 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012454 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012456 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12457 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458 return NULL;
12459
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012460 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461}
12462
12463static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012464PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012466 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467}
12468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012469PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471\n\
12472Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012473and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474
12475static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012476unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012478 if (PyUnicode_READY(self) == -1)
12479 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012480 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481}
12482
Georg Brandlceee0772007-11-27 23:48:05 +000012483PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012485\n\
12486Return a translation table usable for str.translate().\n\
12487If there is only one argument, it must be a dictionary mapping Unicode\n\
12488ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012489Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012490If there are two arguments, they must be strings of equal length, and\n\
12491in the resulting dictionary, each character in x will be mapped to the\n\
12492character at the same position in y. If there is a third argument, it\n\
12493must be a string, whose characters will be mapped to None in the result.");
12494
12495static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012496unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012497{
12498 PyObject *x, *y = NULL, *z = NULL;
12499 PyObject *new = NULL, *key, *value;
12500 Py_ssize_t i = 0;
12501 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012502
Georg Brandlceee0772007-11-27 23:48:05 +000012503 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12504 return NULL;
12505 new = PyDict_New();
12506 if (!new)
12507 return NULL;
12508 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 int x_kind, y_kind, z_kind;
12510 void *x_data, *y_data, *z_data;
12511
Georg Brandlceee0772007-11-27 23:48:05 +000012512 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012513 if (!PyUnicode_Check(x)) {
12514 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12515 "be a string if there is a second argument");
12516 goto err;
12517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012519 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12520 "arguments must have equal length");
12521 goto err;
12522 }
12523 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 x_kind = PyUnicode_KIND(x);
12525 y_kind = PyUnicode_KIND(y);
12526 x_data = PyUnicode_DATA(x);
12527 y_data = PyUnicode_DATA(y);
12528 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12529 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012530 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012531 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012532 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012533 if (!value) {
12534 Py_DECREF(key);
12535 goto err;
12536 }
Georg Brandlceee0772007-11-27 23:48:05 +000012537 res = PyDict_SetItem(new, key, value);
12538 Py_DECREF(key);
12539 Py_DECREF(value);
12540 if (res < 0)
12541 goto err;
12542 }
12543 /* create entries for deleting chars in z */
12544 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 z_kind = PyUnicode_KIND(z);
12546 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012547 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012549 if (!key)
12550 goto err;
12551 res = PyDict_SetItem(new, key, Py_None);
12552 Py_DECREF(key);
12553 if (res < 0)
12554 goto err;
12555 }
12556 }
12557 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 int kind;
12559 void *data;
12560
Georg Brandlceee0772007-11-27 23:48:05 +000012561 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012562 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012563 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12564 "to maketrans it must be a dict");
12565 goto err;
12566 }
12567 /* copy entries into the new dict, converting string keys to int keys */
12568 while (PyDict_Next(x, &i, &key, &value)) {
12569 if (PyUnicode_Check(key)) {
12570 /* convert string keys to integer keys */
12571 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012572 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012573 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12574 "table must be of length 1");
12575 goto err;
12576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 kind = PyUnicode_KIND(key);
12578 data = PyUnicode_DATA(key);
12579 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012580 if (!newkey)
12581 goto err;
12582 res = PyDict_SetItem(new, newkey, value);
12583 Py_DECREF(newkey);
12584 if (res < 0)
12585 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012586 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012587 /* just keep integer keys */
12588 if (PyDict_SetItem(new, key, value) < 0)
12589 goto err;
12590 } else {
12591 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12592 "be strings or integers");
12593 goto err;
12594 }
12595 }
12596 }
12597 return new;
12598 err:
12599 Py_DECREF(new);
12600 return NULL;
12601}
12602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012603PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605\n\
12606Return a copy of the string S, where all characters have been mapped\n\
12607through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012608Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012609Unmapped characters are left untouched. Characters mapped to None\n\
12610are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611
12612static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616}
12617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012618PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012621Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622
12623static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012624unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012626 if (PyUnicode_READY(self) == -1)
12627 return NULL;
12628 if (PyUnicode_IS_ASCII(self))
12629 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012630 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012633PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012636Pad a numeric string S with zeros on the left, to fill a field\n\
12637of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638
12639static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012640unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012642 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012643 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012644 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 int kind;
12646 void *data;
12647 Py_UCS4 chr;
12648
Martin v. Löwis18e16552006-02-15 17:27:45 +000012649 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650 return NULL;
12651
Benjamin Petersonbac79492012-01-14 13:34:47 -050012652 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654
Victor Stinnerc4b49542011-12-11 22:44:26 +010012655 if (PyUnicode_GET_LENGTH(self) >= width)
12656 return unicode_result_unchanged(self);
12657
12658 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659
12660 u = pad(self, fill, 0, '0');
12661
Walter Dörwald068325e2002-04-15 13:36:47 +000012662 if (u == NULL)
12663 return NULL;
12664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 kind = PyUnicode_KIND(u);
12666 data = PyUnicode_DATA(u);
12667 chr = PyUnicode_READ(kind, data, fill);
12668
12669 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 PyUnicode_WRITE(kind, data, 0, chr);
12672 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673 }
12674
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012675 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012676 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012687PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012690Return True if S starts with the specified prefix, False otherwise.\n\
12691With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012692With optional end, stop comparing S at that position.\n\
12693prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012696unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012699 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012700 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012701 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012702 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012703 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
Jesus Ceaac451502011-04-20 17:09:23 +020012705 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012707 if (PyTuple_Check(subobj)) {
12708 Py_ssize_t i;
12709 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012710 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012711 if (substring == NULL)
12712 return NULL;
12713 result = tailmatch(self, substring, start, end, -1);
12714 Py_DECREF(substring);
12715 if (result) {
12716 Py_RETURN_TRUE;
12717 }
12718 }
12719 /* nothing matched */
12720 Py_RETURN_FALSE;
12721 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012722 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012723 if (substring == NULL) {
12724 if (PyErr_ExceptionMatches(PyExc_TypeError))
12725 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12726 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012728 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012729 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012731 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732}
12733
12734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012735PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012738Return True if S ends with the specified suffix, False otherwise.\n\
12739With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012740With optional end, stop comparing S at that position.\n\
12741suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742
12743static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012744unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012747 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012748 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012749 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012750 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012751 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752
Jesus Ceaac451502011-04-20 17:09:23 +020012753 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012755 if (PyTuple_Check(subobj)) {
12756 Py_ssize_t i;
12757 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012758 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012762 result = tailmatch(self, substring, start, end, +1);
12763 Py_DECREF(substring);
12764 if (result) {
12765 Py_RETURN_TRUE;
12766 }
12767 }
12768 Py_RETURN_FALSE;
12769 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012771 if (substring == NULL) {
12772 if (PyErr_ExceptionMatches(PyExc_TypeError))
12773 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12774 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012776 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780}
12781
Victor Stinner202fdca2012-05-07 12:47:02 +020012782Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012783_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012784{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012785 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012786 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12787 writer->data = PyUnicode_DATA(writer->buffer);
12788 writer->kind = PyUnicode_KIND(writer->buffer);
12789}
12790
Victor Stinnerd3f08822012-05-29 12:57:52 +020012791void
12792_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012793{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012794 memset(writer, 0, sizeof(*writer));
12795#ifdef Py_DEBUG
12796 writer->kind = 5; /* invalid kind */
12797#endif
12798 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012799 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012800}
12801
Victor Stinnerd3f08822012-05-29 12:57:52 +020012802int
12803_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12804 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012805{
12806 Py_ssize_t newlen;
12807 PyObject *newbuffer;
12808
Victor Stinnerd3f08822012-05-29 12:57:52 +020012809 assert(length > 0);
12810
Victor Stinner202fdca2012-05-07 12:47:02 +020012811 if (length > PY_SSIZE_T_MAX - writer->pos) {
12812 PyErr_NoMemory();
12813 return -1;
12814 }
12815 newlen = writer->pos + length;
12816
Victor Stinnerd3f08822012-05-29 12:57:52 +020012817 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012818 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012819 /* overallocate 25% to limit the number of resize */
12820 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12821 newlen += newlen / 4;
12822 if (newlen < writer->min_length)
12823 newlen = writer->min_length;
12824 }
12825 writer->buffer = PyUnicode_New(newlen, maxchar);
12826 if (writer->buffer == NULL)
12827 return -1;
12828 _PyUnicodeWriter_Update(writer);
12829 return 0;
12830 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012831
Victor Stinnerd3f08822012-05-29 12:57:52 +020012832 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012833 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012834 /* overallocate 25% to limit the number of resize */
12835 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12836 newlen += newlen / 4;
12837 if (newlen < writer->min_length)
12838 newlen = writer->min_length;
12839 }
12840
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012841 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012842 /* resize + widen */
12843 newbuffer = PyUnicode_New(newlen, maxchar);
12844 if (newbuffer == NULL)
12845 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012846 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12847 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012848 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012849 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012850 }
12851 else {
12852 newbuffer = resize_compact(writer->buffer, newlen);
12853 if (newbuffer == NULL)
12854 return -1;
12855 }
12856 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012857 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012858 }
12859 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012860 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012861 newbuffer = PyUnicode_New(writer->size, maxchar);
12862 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012863 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012864 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12865 writer->buffer, 0, writer->pos);
12866 Py_DECREF(writer->buffer);
12867 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012868 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012869 }
12870 return 0;
12871}
12872
Victor Stinnerd3f08822012-05-29 12:57:52 +020012873int
12874_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12875{
12876 Py_UCS4 maxchar;
12877 Py_ssize_t len;
12878
12879 if (PyUnicode_READY(str) == -1)
12880 return -1;
12881 len = PyUnicode_GET_LENGTH(str);
12882 if (len == 0)
12883 return 0;
12884 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12885 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012886 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012887 Py_INCREF(str);
12888 writer->buffer = str;
12889 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012890 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012891 writer->size = 0;
12892 writer->pos += len;
12893 return 0;
12894 }
12895 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12896 return -1;
12897 }
12898 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12899 str, 0, len);
12900 writer->pos += len;
12901 return 0;
12902}
12903
12904PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012905_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012906{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012907 if (writer->pos == 0) {
12908 Py_XDECREF(writer->buffer);
12909 Py_INCREF(unicode_empty);
12910 return unicode_empty;
12911 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012912 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012913 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12914 return writer->buffer;
12915 }
12916 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12917 PyObject *newbuffer;
12918 newbuffer = resize_compact(writer->buffer, writer->pos);
12919 if (newbuffer == NULL) {
12920 Py_DECREF(writer->buffer);
12921 return NULL;
12922 }
12923 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012924 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012925 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012926 return writer->buffer;
12927}
12928
Victor Stinnerd3f08822012-05-29 12:57:52 +020012929void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012930_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012931{
12932 Py_CLEAR(writer->buffer);
12933}
12934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012936
12937PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012939\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012940Return a formatted version of S, using substitutions from args and kwargs.\n\
12941The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012942
Eric Smith27bbca62010-11-04 17:06:58 +000012943PyDoc_STRVAR(format_map__doc__,
12944 "S.format_map(mapping) -> str\n\
12945\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012946Return a formatted version of S, using substitutions from mapping.\n\
12947The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012948
Eric Smith4a7d76d2008-05-30 18:10:19 +000012949static PyObject *
12950unicode__format__(PyObject* self, PyObject* args)
12951{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012952 PyObject *format_spec;
12953 _PyUnicodeWriter writer;
12954 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012955
12956 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12957 return NULL;
12958
Victor Stinnerd3f08822012-05-29 12:57:52 +020012959 if (PyUnicode_READY(self) == -1)
12960 return NULL;
12961 _PyUnicodeWriter_Init(&writer, 0);
12962 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12963 self, format_spec, 0,
12964 PyUnicode_GET_LENGTH(format_spec));
12965 if (ret == -1) {
12966 _PyUnicodeWriter_Dealloc(&writer);
12967 return NULL;
12968 }
12969 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012970}
12971
Eric Smith8c663262007-08-25 02:26:07 +000012972PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012974\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012975Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012976
12977static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012978unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 Py_ssize_t size;
12981
12982 /* If it's a compact object, account for base structure +
12983 character data. */
12984 if (PyUnicode_IS_COMPACT_ASCII(v))
12985 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12986 else if (PyUnicode_IS_COMPACT(v))
12987 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012988 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 else {
12990 /* If it is a two-block object, account for base object, and
12991 for character block if present. */
12992 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012993 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012995 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 }
12997 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012998 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012999 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013001 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013002 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003
13004 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013005}
13006
13007PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013008 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013009
13010static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013011unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013012{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013013 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 if (!copy)
13015 return NULL;
13016 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013017}
13018
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013020 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013021 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013022 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13023 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013024 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13025 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013026 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013027 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13028 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13029 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13030 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13031 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013032 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013033 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13034 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13035 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013036 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013037 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13038 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13039 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013040 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013041 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013042 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013043 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013044 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13045 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13046 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13047 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13048 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13049 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13050 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13051 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13052 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13053 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13054 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13055 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13056 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13057 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013058 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013059 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013060 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013061 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013062 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013063 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013064 {"maketrans", (PyCFunction) unicode_maketrans,
13065 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013066 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013067#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013068 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013069 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070#endif
13071
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073 {NULL, NULL}
13074};
13075
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013076static PyObject *
13077unicode_mod(PyObject *v, PyObject *w)
13078{
Brian Curtindfc80e32011-08-10 20:28:54 -050013079 if (!PyUnicode_Check(v))
13080 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013082}
13083
13084static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013085 0, /*nb_add*/
13086 0, /*nb_subtract*/
13087 0, /*nb_multiply*/
13088 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013089};
13090
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013092 (lenfunc) unicode_length, /* sq_length */
13093 PyUnicode_Concat, /* sq_concat */
13094 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13095 (ssizeargfunc) unicode_getitem, /* sq_item */
13096 0, /* sq_slice */
13097 0, /* sq_ass_item */
13098 0, /* sq_ass_slice */
13099 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100};
13101
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013102static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013103unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 if (PyUnicode_READY(self) == -1)
13106 return NULL;
13107
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013108 if (PyIndex_Check(item)) {
13109 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013110 if (i == -1 && PyErr_Occurred())
13111 return NULL;
13112 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013114 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013115 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013116 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013117 PyObject *result;
13118 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013119 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013120 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013124 return NULL;
13125 }
13126
13127 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013128 Py_INCREF(unicode_empty);
13129 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013131 slicelength == PyUnicode_GET_LENGTH(self)) {
13132 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013133 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013134 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013135 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013136 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013137 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013138 src_kind = PyUnicode_KIND(self);
13139 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013140 if (!PyUnicode_IS_ASCII(self)) {
13141 kind_limit = kind_maxchar_limit(src_kind);
13142 max_char = 0;
13143 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13144 ch = PyUnicode_READ(src_kind, src_data, cur);
13145 if (ch > max_char) {
13146 max_char = ch;
13147 if (max_char >= kind_limit)
13148 break;
13149 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013150 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013151 }
Victor Stinner55c99112011-10-13 01:17:06 +020013152 else
13153 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013154 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013155 if (result == NULL)
13156 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013158 dest_data = PyUnicode_DATA(result);
13159
13160 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013161 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13162 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013163 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013164 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013165 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013166 } else {
13167 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13168 return NULL;
13169 }
13170}
13171
13172static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 (lenfunc)unicode_length, /* mp_length */
13174 (binaryfunc)unicode_subscript, /* mp_subscript */
13175 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013176};
13177
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179/* Helpers for PyUnicode_Format() */
13180
13181static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013182getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013184 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 (*p_argidx)++;
13187 if (arglen < 0)
13188 return args;
13189 else
13190 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191 }
13192 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194 return NULL;
13195}
13196
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013197/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198
Victor Stinnerd3f08822012-05-29 12:57:52 +020013199static int
13200formatfloat(PyObject *v, int flags, int prec, int type,
13201 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013203 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013206
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 x = PyFloat_AsDouble(v);
13208 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013209 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013210
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013213
Eric Smith0923d1d2009-04-16 20:16:10 +000013214 p = PyOS_double_to_string(x, type, prec,
13215 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013216 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013217 return -1;
13218 len = strlen(p);
13219 if (writer) {
13220 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13221 return -1;
Victor Stinner3a7d0962012-05-29 18:53:56 +020013222 memcpy((char*)writer->data + writer->pos * writer->kind,
Victor Stinnerd3f08822012-05-29 12:57:52 +020013223 p,
13224 len);
13225 writer->pos += len;
13226 }
13227 else
13228 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013229 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013230 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231}
13232
Victor Stinnerd0880d52012-04-27 23:40:13 +020013233/* formatlong() emulates the format codes d, u, o, x and X, and
13234 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13235 * Python's regular ints.
13236 * Return value: a new PyUnicodeObject*, or NULL if error.
13237 * The output string is of the form
13238 * "-"? ("0x" | "0X")? digit+
13239 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13240 * set in flags. The case of hex digits will be correct,
13241 * There will be at least prec digits, zero-filled on the left if
13242 * necessary to get that many.
13243 * val object to be converted
13244 * flags bitmask of format flags; only F_ALT is looked at
13245 * prec minimum number of digits; 0-fill on left if needed
13246 * type a character in [duoxX]; u acts the same as d
13247 *
13248 * CAUTION: o, x and X conversions on regular ints can never
13249 * produce a '-' sign, but can for Python's unbounded ints.
13250 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013251static PyObject*
13252formatlong(PyObject *val, int flags, int prec, int type)
13253{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013256 Py_ssize_t i;
13257 int sign; /* 1 if '-', else 0 */
13258 int len; /* number of characters */
13259 Py_ssize_t llen;
13260 int numdigits; /* len == numnondigits + numdigits */
13261 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013262
Victor Stinnerd0880d52012-04-27 23:40:13 +020013263 /* Avoid exceeding SSIZE_T_MAX */
13264 if (prec > INT_MAX-3) {
13265 PyErr_SetString(PyExc_OverflowError,
13266 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013268 }
13269
13270 assert(PyLong_Check(val));
13271
13272 switch (type) {
13273 case 'd':
13274 case 'u':
13275 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013276 if (PyBool_Check(val))
13277 result = PyNumber_ToBase(val, 10);
13278 else
13279 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013280 break;
13281 case 'o':
13282 numnondigits = 2;
13283 result = PyNumber_ToBase(val, 8);
13284 break;
13285 case 'x':
13286 case 'X':
13287 numnondigits = 2;
13288 result = PyNumber_ToBase(val, 16);
13289 break;
13290 default:
13291 assert(!"'type' not in [duoxX]");
13292 }
13293 if (!result)
13294 return NULL;
13295
13296 assert(unicode_modifiable(result));
13297 assert(PyUnicode_IS_READY(result));
13298 assert(PyUnicode_IS_ASCII(result));
13299
13300 /* To modify the string in-place, there can only be one reference. */
13301 if (Py_REFCNT(result) != 1) {
13302 PyErr_BadInternalCall();
13303 return NULL;
13304 }
13305 buf = PyUnicode_DATA(result);
13306 llen = PyUnicode_GET_LENGTH(result);
13307 if (llen > INT_MAX) {
13308 PyErr_SetString(PyExc_ValueError,
13309 "string too large in _PyBytes_FormatLong");
13310 return NULL;
13311 }
13312 len = (int)llen;
13313 sign = buf[0] == '-';
13314 numnondigits += sign;
13315 numdigits = len - numnondigits;
13316 assert(numdigits > 0);
13317
13318 /* Get rid of base marker unless F_ALT */
13319 if (((flags & F_ALT) == 0 &&
13320 (type == 'o' || type == 'x' || type == 'X'))) {
13321 assert(buf[sign] == '0');
13322 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13323 buf[sign+1] == 'o');
13324 numnondigits -= 2;
13325 buf += 2;
13326 len -= 2;
13327 if (sign)
13328 buf[0] = '-';
13329 assert(len == numnondigits + numdigits);
13330 assert(numdigits > 0);
13331 }
13332
13333 /* Fill with leading zeroes to meet minimum width. */
13334 if (prec > numdigits) {
13335 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13336 numnondigits + prec);
13337 char *b1;
13338 if (!r1) {
13339 Py_DECREF(result);
13340 return NULL;
13341 }
13342 b1 = PyBytes_AS_STRING(r1);
13343 for (i = 0; i < numnondigits; ++i)
13344 *b1++ = *buf++;
13345 for (i = 0; i < prec - numdigits; i++)
13346 *b1++ = '0';
13347 for (i = 0; i < numdigits; i++)
13348 *b1++ = *buf++;
13349 *b1 = '\0';
13350 Py_DECREF(result);
13351 result = r1;
13352 buf = PyBytes_AS_STRING(result);
13353 len = numnondigits + prec;
13354 }
13355
13356 /* Fix up case for hex conversions. */
13357 if (type == 'X') {
13358 /* Need to convert all lower case letters to upper case.
13359 and need to convert 0x to 0X (and -0x to -0X). */
13360 for (i = 0; i < len; i++)
13361 if (buf[i] >= 'a' && buf[i] <= 'x')
13362 buf[i] -= 'a'-'A';
13363 }
13364 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13365 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013367 Py_DECREF(result);
13368 result = unicode;
13369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013370 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013371}
13372
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013373static Py_UCS4
13374formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013376 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013377 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013379 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 goto onError;
13382 }
13383 else {
13384 /* Integer input truncated to a character */
13385 long x;
13386 x = PyLong_AsLong(v);
13387 if (x == -1 && PyErr_Occurred())
13388 goto onError;
13389
Victor Stinner8faf8212011-12-08 22:14:11 +010013390 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 PyErr_SetString(PyExc_OverflowError,
13392 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013393 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 }
13395
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013398
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013400 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013402 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403}
13404
Alexander Belopolsky40018472011-02-26 01:02:56 +000013405PyObject *
13406PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013408 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 PyObject *temp = NULL;
13412 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013413 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013414 void *fmt;
13415 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013416 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013417 Py_ssize_t sublen;
13418 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013419
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 PyErr_BadInternalCall();
13422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013424 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013425 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013427 if (PyUnicode_READY(uformat) == -1)
13428 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 fmt = PyUnicode_DATA(uformat);
13431 fmtkind = PyUnicode_KIND(uformat);
13432 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13433 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434
Victor Stinnerd3f08822012-05-29 12:57:52 +020013435 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013436
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 arglen = PyTuple_Size(args);
13439 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440 }
13441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 arglen = -1;
13443 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013445 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013446 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448
13449 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013451 Py_ssize_t nonfmtpos;
13452 nonfmtpos = fmtpos++;
13453 while (fmtcnt >= 0 &&
13454 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13455 fmtpos++;
13456 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013457 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013458 if (fmtcnt < 0)
13459 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013460 sublen = fmtpos - nonfmtpos;
13461 maxchar = _PyUnicode_FindMaxChar(uformat,
13462 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013463 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013464 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013465
Victor Stinnerd3f08822012-05-29 12:57:52 +020013466 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13467 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013468 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013469 }
13470 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 /* Got a format specifier */
13472 int flags = 0;
13473 Py_ssize_t width = -1;
13474 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013475 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013476 Py_UCS4 fill;
13477 int sign;
13478 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 int isnumok;
13480 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013481 void *pbuf = NULL;
13482 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013483 Py_UCS4 bufmaxchar;
13484 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013487 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13488 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 Py_ssize_t keylen;
13491 PyObject *key;
13492 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013493
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 if (dict == NULL) {
13495 PyErr_SetString(PyExc_TypeError,
13496 "format requires a mapping");
13497 goto onError;
13498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013499 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013500 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013501 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 /* Skip over balanced parentheses */
13503 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013504 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13505 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013507 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013508 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013511 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 if (fmtcnt < 0 || pcount > 0) {
13513 PyErr_SetString(PyExc_ValueError,
13514 "incomplete format key");
13515 goto onError;
13516 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013517 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013518 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 if (key == NULL)
13520 goto onError;
13521 if (args_owned) {
13522 Py_DECREF(args);
13523 args_owned = 0;
13524 }
13525 args = PyObject_GetItem(dict, key);
13526 Py_DECREF(key);
13527 if (args == NULL) {
13528 goto onError;
13529 }
13530 args_owned = 1;
13531 arglen = -1;
13532 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013533 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013534 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013535 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13536 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013537 case '-': flags |= F_LJUST; continue;
13538 case '+': flags |= F_SIGN; continue;
13539 case ' ': flags |= F_BLANK; continue;
13540 case '#': flags |= F_ALT; continue;
13541 case '0': flags |= F_ZERO; continue;
13542 }
13543 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013544 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 if (c == '*') {
13546 v = getnextarg(args, arglen, &argidx);
13547 if (v == NULL)
13548 goto onError;
13549 if (!PyLong_Check(v)) {
13550 PyErr_SetString(PyExc_TypeError,
13551 "* wants int");
13552 goto onError;
13553 }
13554 width = PyLong_AsLong(v);
13555 if (width == -1 && PyErr_Occurred())
13556 goto onError;
13557 if (width < 0) {
13558 flags |= F_LJUST;
13559 width = -width;
13560 }
13561 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013562 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 }
13564 else if (c >= '0' && c <= '9') {
13565 width = c - '0';
13566 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 if (c < '0' || c > '9')
13569 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013570 /* Since c is unsigned, the RHS would end up as unsigned,
13571 mixing signed and unsigned comparison. Since c is between
13572 '0' and '9', casting to int is safe. */
13573 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 PyErr_SetString(PyExc_ValueError,
13575 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013576 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 }
13578 width = width*10 + (c - '0');
13579 }
13580 }
13581 if (c == '.') {
13582 prec = 0;
13583 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013584 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 if (c == '*') {
13586 v = getnextarg(args, arglen, &argidx);
13587 if (v == NULL)
13588 goto onError;
13589 if (!PyLong_Check(v)) {
13590 PyErr_SetString(PyExc_TypeError,
13591 "* wants int");
13592 goto onError;
13593 }
13594 prec = PyLong_AsLong(v);
13595 if (prec == -1 && PyErr_Occurred())
13596 goto onError;
13597 if (prec < 0)
13598 prec = 0;
13599 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013600 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 }
13602 else if (c >= '0' && c <= '9') {
13603 prec = c - '0';
13604 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 if (c < '0' || c > '9')
13607 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013608 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 PyErr_SetString(PyExc_ValueError,
13610 "prec too big");
13611 goto onError;
13612 }
13613 prec = prec*10 + (c - '0');
13614 }
13615 }
13616 } /* prec */
13617 if (fmtcnt >= 0) {
13618 if (c == 'h' || c == 'l' || c == 'L') {
13619 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013620 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013621 }
13622 }
13623 if (fmtcnt < 0) {
13624 PyErr_SetString(PyExc_ValueError,
13625 "incomplete format");
13626 goto onError;
13627 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013628 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013629 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013630
13631 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013632 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013633 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013634 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13635 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013636 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013638
Victor Stinneraff3cc62012-04-30 05:19:21 +020013639 v = getnextarg(args, arglen, &argidx);
13640 if (v == NULL)
13641 goto onError;
13642
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013644 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 fill = ' ';
13646 switch (c) {
13647
Benjamin Peterson29060642009-01-31 22:14:21 +000013648 case 's':
13649 case 'r':
13650 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013651 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13652 /* Fast path */
13653 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13654 goto onError;
13655 goto nextarg;
13656 }
13657
Victor Stinner808fc0a2010-03-22 12:50:40 +000013658 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 temp = v;
13660 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013661 }
13662 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 if (c == 's')
13664 temp = PyObject_Str(v);
13665 else if (c == 'r')
13666 temp = PyObject_Repr(v);
13667 else
13668 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 break;
13671
13672 case 'i':
13673 case 'd':
13674 case 'u':
13675 case 'o':
13676 case 'x':
13677 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013678 if (PyLong_CheckExact(v)
13679 && width == -1 && prec == -1
13680 && !(flags & (F_SIGN | F_BLANK)))
13681 {
13682 /* Fast path */
13683 switch(c)
13684 {
13685 case 'd':
13686 case 'i':
13687 case 'u':
13688 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13689 goto onError;
13690 goto nextarg;
13691 case 'x':
13692 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13693 goto onError;
13694 goto nextarg;
13695 case 'o':
13696 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13697 goto onError;
13698 goto nextarg;
13699 default:
13700 break;
13701 }
13702 }
13703
Benjamin Peterson29060642009-01-31 22:14:21 +000013704 isnumok = 0;
13705 if (PyNumber_Check(v)) {
13706 PyObject *iobj=NULL;
13707
13708 if (PyLong_Check(v)) {
13709 iobj = v;
13710 Py_INCREF(iobj);
13711 }
13712 else {
13713 iobj = PyNumber_Long(v);
13714 }
13715 if (iobj!=NULL) {
13716 if (PyLong_Check(iobj)) {
13717 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013718 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013719 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 }
13722 else {
13723 Py_DECREF(iobj);
13724 }
13725 }
13726 }
13727 if (!isnumok) {
13728 PyErr_Format(PyExc_TypeError,
13729 "%%%c format: a number is required, "
13730 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13731 goto onError;
13732 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013733 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 fill = '0';
13735 break;
13736
13737 case 'e':
13738 case 'E':
13739 case 'f':
13740 case 'F':
13741 case 'g':
13742 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013743 if (width == -1 && prec == -1
13744 && !(flags & (F_SIGN | F_BLANK)))
13745 {
13746 /* Fast path */
13747 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13748 goto onError;
13749 goto nextarg;
13750 }
13751
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013753 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013754 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013755 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13756 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 break;
13758
13759 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013760 {
13761 Py_UCS4 ch = formatchar(v);
13762 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013763 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013764 if (width == -1 && prec == -1) {
13765 /* Fast path */
13766 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13767 goto onError;
13768 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13769 writer.pos += 1;
13770 goto nextarg;
13771 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013772 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013774 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013775
13776 default:
13777 PyErr_Format(PyExc_ValueError,
13778 "unsupported format character '%c' (0x%x) "
13779 "at index %zd",
13780 (31<=c && c<=126) ? (char)c : '?',
13781 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013782 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013783 goto onError;
13784 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013785 if (temp == NULL)
13786 goto onError;
13787 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013788
13789 if (width == -1 && prec == -1
13790 && !(flags & (F_SIGN | F_BLANK)))
13791 {
13792 /* Fast path */
13793 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13794 goto onError;
13795 goto nextarg;
13796 }
13797
Victor Stinneraff3cc62012-04-30 05:19:21 +020013798 if (PyUnicode_READY(temp) == -1) {
13799 Py_CLEAR(temp);
13800 goto onError;
13801 }
13802 kind = PyUnicode_KIND(temp);
13803 pbuf = PyUnicode_DATA(temp);
13804 len = PyUnicode_GET_LENGTH(temp);
13805
13806 if (c == 's' || c == 'r' || c == 'a') {
13807 if (prec >= 0 && len > prec)
13808 len = prec;
13809 }
13810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811 /* pbuf is initialized here. */
13812 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013813 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013814 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13815 if (ch == '-' || ch == '+') {
13816 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013817 len--;
13818 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 }
13820 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013821 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013823 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013824 else
13825 sign = 0;
13826 }
13827 if (width < len)
13828 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013829
13830 /* Compute the length and maximum character of the
13831 written characters */
13832 bufmaxchar = 127;
13833 if (!(flags & F_LJUST)) {
13834 if (sign) {
13835 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013836 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013837 }
13838 else {
13839 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013841 }
13842 }
13843 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013845
13846 buflen = width;
13847 if (sign && len == width)
13848 buflen++;
13849
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013850 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013851 goto onError;
13852
13853 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013854 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013855 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013856 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13857 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013858 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 if (width > len)
13860 width--;
13861 }
13862 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013863 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013864 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013865 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013866 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13867 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13868 writer.pos += 2;
13869 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013870 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 width -= 2;
13872 if (width < 0)
13873 width = 0;
13874 len -= 2;
13875 }
13876 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013877 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013878 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13879 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013880 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 }
13882 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013883 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013884 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13885 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013886 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13889 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013890 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13891 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13892 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013893 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013894 }
13895 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013896
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013897 if (len) {
13898 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13899 temp, pindex, len);
13900 writer.pos += len;
13901 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013902 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013903 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013904 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13905 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013906 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013907
Victor Stinnerd3f08822012-05-29 12:57:52 +020013908nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 if (dict && (argidx < arglen) && c != '%') {
13910 PyErr_SetString(PyExc_TypeError,
13911 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013912 goto onError;
13913 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013914 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013915 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916 } /* until end */
13917 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013918 PyErr_SetString(PyExc_TypeError,
13919 "not all arguments converted during string formatting");
13920 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921 }
13922
13923 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925 }
13926 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013927 Py_XDECREF(temp);
13928 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013929 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930
Benjamin Peterson29060642009-01-31 22:14:21 +000013931 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013932 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013933 Py_XDECREF(temp);
13934 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013935 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938 }
13939 return NULL;
13940}
13941
Jeremy Hylton938ace62002-07-17 16:30:39 +000013942static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013943unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13944
Tim Peters6d6c1a32001-08-02 04:15:00 +000013945static PyObject *
13946unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13947{
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013949 static char *kwlist[] = {"object", "encoding", "errors", 0};
13950 char *encoding = NULL;
13951 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013952
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 if (type != &PyUnicode_Type)
13954 return unicode_subtype_new(type, args, kwds);
13955 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013956 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013958 if (x == NULL) {
13959 Py_INCREF(unicode_empty);
13960 return unicode_empty;
13961 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 if (encoding == NULL && errors == NULL)
13963 return PyObject_Str(x);
13964 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013965 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013966}
13967
Guido van Rossume023fe02001-08-30 03:12:59 +000013968static PyObject *
13969unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13970{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013971 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013972 Py_ssize_t length, char_size;
13973 int share_wstr, share_utf8;
13974 unsigned int kind;
13975 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013976
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013978
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013979 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013980 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013981 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013982 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013983 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013984 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013985 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013986 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013987
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013988 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013989 if (self == NULL) {
13990 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013991 return NULL;
13992 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013993 kind = PyUnicode_KIND(unicode);
13994 length = PyUnicode_GET_LENGTH(unicode);
13995
13996 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013997#ifdef Py_DEBUG
13998 _PyUnicode_HASH(self) = -1;
13999#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014000 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014001#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014002 _PyUnicode_STATE(self).interned = 0;
14003 _PyUnicode_STATE(self).kind = kind;
14004 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014005 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014006 _PyUnicode_STATE(self).ready = 1;
14007 _PyUnicode_WSTR(self) = NULL;
14008 _PyUnicode_UTF8_LENGTH(self) = 0;
14009 _PyUnicode_UTF8(self) = NULL;
14010 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014011 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014012
14013 share_utf8 = 0;
14014 share_wstr = 0;
14015 if (kind == PyUnicode_1BYTE_KIND) {
14016 char_size = 1;
14017 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14018 share_utf8 = 1;
14019 }
14020 else if (kind == PyUnicode_2BYTE_KIND) {
14021 char_size = 2;
14022 if (sizeof(wchar_t) == 2)
14023 share_wstr = 1;
14024 }
14025 else {
14026 assert(kind == PyUnicode_4BYTE_KIND);
14027 char_size = 4;
14028 if (sizeof(wchar_t) == 4)
14029 share_wstr = 1;
14030 }
14031
14032 /* Ensure we won't overflow the length. */
14033 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14034 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014035 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014037 data = PyObject_MALLOC((length + 1) * char_size);
14038 if (data == NULL) {
14039 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014040 goto onError;
14041 }
14042
Victor Stinnerc3c74152011-10-02 20:39:55 +020014043 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014044 if (share_utf8) {
14045 _PyUnicode_UTF8_LENGTH(self) = length;
14046 _PyUnicode_UTF8(self) = data;
14047 }
14048 if (share_wstr) {
14049 _PyUnicode_WSTR_LENGTH(self) = length;
14050 _PyUnicode_WSTR(self) = (wchar_t *)data;
14051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014052
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014053 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014054 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014055 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014056#ifdef Py_DEBUG
14057 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14058#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014059 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014060 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014061
14062onError:
14063 Py_DECREF(unicode);
14064 Py_DECREF(self);
14065 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014066}
14067
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014074
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014075static PyObject *unicode_iter(PyObject *seq);
14076
/* Type object for str.  Instances are created through unicode_new() and may
   be subclassed (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14120
14121/* Initialize the Unicode implementation */
14122
Victor Stinner3a50e702011-10-18 21:21:00 +020014123int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014125 int i;
14126
Thomas Wouters477c8d52006-05-27 19:21:47 +000014127 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014128 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014129 0x000A, /* LINE FEED */
14130 0x000D, /* CARRIAGE RETURN */
14131 0x001C, /* FILE SEPARATOR */
14132 0x001D, /* GROUP SEPARATOR */
14133 0x001E, /* RECORD SEPARATOR */
14134 0x0085, /* NEXT LINE */
14135 0x2028, /* LINE SEPARATOR */
14136 0x2029, /* PARAGRAPH SEPARATOR */
14137 };
14138
Fred Drakee4315f52000-05-09 19:53:39 +000014139 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014140 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014141 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014142 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014143 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014144
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014145 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014146 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014147 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014148 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014149
14150 /* initialize the linebreak bloom filter */
14151 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014152 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014153 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014154
14155 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014156
14157#ifdef HAVE_MBCS
14158 winver.dwOSVersionInfoSize = sizeof(winver);
14159 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14160 PyErr_SetFromWindowsErr(0);
14161 return -1;
14162 }
14163#endif
14164 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014165}
14166
14167/* Finalize the Unicode implementation */
14168
/* This implementation has nothing to release here and always returns 0.
   NOTE(review): the return value is presumably the number of freed items;
   confirm against the public API documentation. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14174
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175void
Thomas Wouters78890102000-07-22 19:25:51 +000014176_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014177{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014178 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014180 Py_XDECREF(unicode_empty);
14181 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014182
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014183 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014184 if (unicode_latin1[i]) {
14185 Py_DECREF(unicode_latin1[i]);
14186 unicode_latin1[i] = NULL;
14187 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014188 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014189 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014190 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014191}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014192
Walter Dörwald16807132007-05-25 13:52:07 +000014193void
14194PyUnicode_InternInPlace(PyObject **p)
14195{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014196 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014197 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014198#ifdef Py_DEBUG
14199 assert(s != NULL);
14200 assert(_PyUnicode_CHECK(s));
14201#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014202 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014203 return;
14204#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 /* If it's a subclass, we don't really know what putting
14206 it in the interned dict might do. */
14207 if (!PyUnicode_CheckExact(s))
14208 return;
14209 if (PyUnicode_CHECK_INTERNED(s))
14210 return;
14211 if (interned == NULL) {
14212 interned = PyDict_New();
14213 if (interned == NULL) {
14214 PyErr_Clear(); /* Don't leave an exception */
14215 return;
14216 }
14217 }
14218 /* It might be that the GetItem call fails even
14219 though the key is present in the dictionary,
14220 namely when this happens during a stack overflow. */
14221 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014222 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014223 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014224
Benjamin Peterson29060642009-01-31 22:14:21 +000014225 if (t) {
14226 Py_INCREF(t);
14227 Py_DECREF(*p);
14228 *p = t;
14229 return;
14230 }
Walter Dörwald16807132007-05-25 13:52:07 +000014231
Benjamin Peterson14339b62009-01-31 16:36:08 +000014232 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014233 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 PyErr_Clear();
14235 PyThreadState_GET()->recursion_critical = 0;
14236 return;
14237 }
14238 PyThreadState_GET()->recursion_critical = 0;
14239 /* The two references in interned are not counted by refcnt.
14240 The deallocator will take care of this */
14241 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014242 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014243}
14244
14245void
14246PyUnicode_InternImmortal(PyObject **p)
14247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 PyUnicode_InternInPlace(p);
14249 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014250 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014251 Py_INCREF(*p);
14252 }
Walter Dörwald16807132007-05-25 13:52:07 +000014253}
14254
14255PyObject *
14256PyUnicode_InternFromString(const char *cp)
14257{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014258 PyObject *s = PyUnicode_FromString(cp);
14259 if (s == NULL)
14260 return NULL;
14261 PyUnicode_InternInPlace(&s);
14262 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014263}
14264
Alexander Belopolsky40018472011-02-26 01:02:56 +000014265void
14266_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014269 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014270 Py_ssize_t i, n;
14271 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014272
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 if (interned == NULL || !PyDict_Check(interned))
14274 return;
14275 keys = PyDict_Keys(interned);
14276 if (keys == NULL || !PyList_Check(keys)) {
14277 PyErr_Clear();
14278 return;
14279 }
Walter Dörwald16807132007-05-25 13:52:07 +000014280
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14282 detector, interned unicode strings are not forcibly deallocated;
14283 rather, we give them their stolen references back, and then clear
14284 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014285
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 n = PyList_GET_SIZE(keys);
14287 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014288 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014289 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014290 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014291 if (PyUnicode_READY(s) == -1) {
14292 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014293 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014295 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014296 case SSTATE_NOT_INTERNED:
14297 /* XXX Shouldn't happen */
14298 break;
14299 case SSTATE_INTERNED_IMMORTAL:
14300 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014301 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014302 break;
14303 case SSTATE_INTERNED_MORTAL:
14304 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014305 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 break;
14307 default:
14308 Py_FatalError("Inconsistent interned string state.");
14309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014310 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014311 }
14312 fprintf(stderr, "total size of all interned strings: "
14313 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14314 "mortal/immortal\n", mortal_size, immortal_size);
14315 Py_DECREF(keys);
14316 PyDict_Clear(interned);
14317 Py_DECREF(interned);
14318 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014319}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014320
14321
14322/********************* Unicode Iterator **************************/
14323
/* State of a str iterator: the string being iterated and the index of the
   next character to return. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to yield */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14329
/* Deallocator for str iterators: untrack the iterator from the GC, drop the
   reference to the iterated string (if any) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
14337
/* GC traversal for str iterators: the iterated string is the only object
   reference held. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14344
14345static PyObject *
14346unicodeiter_next(unicodeiterobject *it)
14347{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014348 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014349
Benjamin Peterson14339b62009-01-31 16:36:08 +000014350 assert(it != NULL);
14351 seq = it->it_seq;
14352 if (seq == NULL)
14353 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014354 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14357 int kind = PyUnicode_KIND(seq);
14358 void *data = PyUnicode_DATA(seq);
14359 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14360 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 if (item != NULL)
14362 ++it->it_index;
14363 return item;
14364 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014365
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 Py_DECREF(seq);
14367 it->it_seq = NULL;
14368 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014369}
14370
14371static PyObject *
14372unicodeiter_len(unicodeiterobject *it)
14373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 Py_ssize_t len = 0;
14375 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014376 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014378}
14379
14380PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14381
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014382static PyObject *
14383unicodeiter_reduce(unicodeiterobject *it)
14384{
14385 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014386 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014387 it->it_seq, it->it_index);
14388 } else {
14389 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14390 if (u == NULL)
14391 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014392 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014393 }
14394}
14395
14396PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14397
14398static PyObject *
14399unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14400{
14401 Py_ssize_t index = PyLong_AsSsize_t(state);
14402 if (index == -1 && PyErr_Occurred())
14403 return NULL;
14404 if (index < 0)
14405 index = 0;
14406 it->it_index = index;
14407 Py_RETURN_NONE;
14408}
14409
14410PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14411
/* Method table of the str iterator type: length hint and pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
14421
14422PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014423 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14424 "str_iterator", /* tp_name */
14425 sizeof(unicodeiterobject), /* tp_basicsize */
14426 0, /* tp_itemsize */
14427 /* methods */
14428 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14429 0, /* tp_print */
14430 0, /* tp_getattr */
14431 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014432 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 0, /* tp_repr */
14434 0, /* tp_as_number */
14435 0, /* tp_as_sequence */
14436 0, /* tp_as_mapping */
14437 0, /* tp_hash */
14438 0, /* tp_call */
14439 0, /* tp_str */
14440 PyObject_GenericGetAttr, /* tp_getattro */
14441 0, /* tp_setattro */
14442 0, /* tp_as_buffer */
14443 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14444 0, /* tp_doc */
14445 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14446 0, /* tp_clear */
14447 0, /* tp_richcompare */
14448 0, /* tp_weaklistoffset */
14449 PyObject_SelfIter, /* tp_iter */
14450 (iternextfunc)unicodeiter_next, /* tp_iternext */
14451 unicodeiter_methods, /* tp_methods */
14452 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014453};
14454
14455static PyObject *
14456unicode_iter(PyObject *seq)
14457{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014458 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014459
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 if (!PyUnicode_Check(seq)) {
14461 PyErr_BadInternalCall();
14462 return NULL;
14463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014464 if (PyUnicode_READY(seq) == -1)
14465 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14467 if (it == NULL)
14468 return NULL;
14469 it->it_index = 0;
14470 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014471 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014472 _PyObject_GC_TRACK(it);
14473 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014474}
14475
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014476
14477size_t
14478Py_UNICODE_strlen(const Py_UNICODE *u)
14479{
14480 int res = 0;
14481 while(*u++)
14482 res++;
14483 return res;
14484}
14485
14486Py_UNICODE*
14487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14488{
14489 Py_UNICODE *u = s1;
14490 while ((*u++ = *s2++));
14491 return s1;
14492}
14493
14494Py_UNICODE*
14495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14496{
14497 Py_UNICODE *u = s1;
14498 while ((*u++ = *s2++))
14499 if (n-- == 0)
14500 break;
14501 return s1;
14502}
14503
14504Py_UNICODE*
14505Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14506{
14507 Py_UNICODE *u1 = s1;
14508 u1 += Py_UNICODE_strlen(u1);
14509 Py_UNICODE_strcpy(u1, s2);
14510 return s1;
14511}
14512
14513int
14514Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14515{
14516 while (*s1 && *s2 && *s1 == *s2)
14517 s1++, s2++;
14518 if (*s1 && *s2)
14519 return (*s1 < *s2) ? -1 : +1;
14520 if (*s1)
14521 return 1;
14522 if (*s2)
14523 return -1;
14524 return 0;
14525}
14526
14527int
14528Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14529{
14530 register Py_UNICODE u1, u2;
14531 for (; n != 0; n--) {
14532 u1 = *s1;
14533 u2 = *s2;
14534 if (u1 != u2)
14535 return (u1 < u2) ? -1 : +1;
14536 if (u1 == '\0')
14537 return 0;
14538 s1++;
14539 s2++;
14540 }
14541 return 0;
14542}
14543
14544Py_UNICODE*
14545Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14546{
14547 const Py_UNICODE *p;
14548 for (p = s; *p; p++)
14549 if (*p == c)
14550 return (Py_UNICODE*)p;
14551 return NULL;
14552}
14553
14554Py_UNICODE*
14555Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14556{
14557 const Py_UNICODE *p;
14558 p = s + Py_UNICODE_strlen(s);
14559 while (p != s) {
14560 p--;
14561 if (*p == c)
14562 return (Py_UNICODE*)p;
14563 }
14564 return NULL;
14565}
Victor Stinner331ea922010-08-10 16:37:20 +000014566
Victor Stinner71133ff2010-09-01 23:43:53 +000014567Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014568PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014569{
Victor Stinner577db2c2011-10-11 22:12:48 +020014570 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014571 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014573 if (!PyUnicode_Check(unicode)) {
14574 PyErr_BadArgument();
14575 return NULL;
14576 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014577 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014578 if (u == NULL)
14579 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014580 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014581 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014582 PyErr_NoMemory();
14583 return NULL;
14584 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014585 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014586 size *= sizeof(Py_UNICODE);
14587 copy = PyMem_Malloc(size);
14588 if (copy == NULL) {
14589 PyErr_NoMemory();
14590 return NULL;
14591 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014592 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014593 return copy;
14594}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014595
Georg Brandl66c221e2010-10-14 07:04:07 +000014596/* A _string module, to export formatter_parser and formatter_field_name_split
14597 to the string.Formatter class implemented in Python. */
14598
14599static PyMethodDef _string_methods[] = {
14600 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14601 METH_O, PyDoc_STR("split the argument as a field name")},
14602 {"formatter_parser", (PyCFunction) formatter_parser,
14603 METH_O, PyDoc_STR("parse the argument as a format string")},
14604 {NULL, NULL}
14605};
14606
14607static struct PyModuleDef _string_module = {
14608 PyModuleDef_HEAD_INIT,
14609 "_string",
14610 PyDoc_STR("string helper module"),
14611 0,
14612 _string_methods,
14613 NULL,
14614 NULL,
14615 NULL,
14616 NULL
14617};
14618
14619PyMODINIT_FUNC
14620PyInit__string(void)
14621{
14622 return PyModule_Create(&_string_module);
14623}
14624
14625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014626#ifdef __cplusplus
14627}
14628#endif