blob: 353d2bbf2fa64e992a238338e29c795c91d39e35 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian in every other case. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Highest code point defined by Unicode 6.0: U+10FFFF (decimal 1,114,111) */
#define MAX_UNICODE 0x10FFFF
72
/* _PyUnicode_CHECK(op): a plain PyUnicode_Check() in regular builds; with
   Py_DEBUG defined it runs _PyUnicode_CheckConsistency() instead, with the
   content check disabled (second argument 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Raw field accessors for the unicode object layouts.

   The underscore-prefixed macros are bare casts onto a struct member
   (_PyUnicode_KIND and _PyUnicode_GET_LENGTH also assert
   _PyUnicode_CHECK()).  PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert
   that the object passes _PyUnicode_CHECK() and is ready; for a compact
   ASCII string they yield the memory located directly after the
   PyASCIIObject header and the object's length, instead of the utf8 and
   utf8_length members. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((char*)((PyASCIIObject*)(op) + 1))          \
         : _PyUnicode_UTF8(op))

#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((PyASCIIObject*)(op))->length               \
         : _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Cheap stand-in for Py_MAX() when combining two maximum characters: the
   bitwise OR of both values, which is never smaller than either of them.
   Use it when you are computing the second argument of PyUnicode_New(). */
#define MAX_MAXCHAR(maxchar1, maxchar2) ((maxchar1) | (maxchar2))
118
/* File-local PyUnicode_READY(): any earlier definition is dropped first.
   Asserts _PyUnicode_CHECK(op), then evaluates to 0 when the object is
   already ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (PyUnicode_IS_READY(op)                    \
          ? 0                                   \
          : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* true if the utf8 pointer of the object is the character data itself
   (no separate UTF-8 buffer); must not be used on compact ASCII strings */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the object is the character data itself
   (no separate wchar_t buffer).  The macro argument is used throughout:
   the previous expansion referred to a variable literally named "unicode"
   and therefore only worked by accident at call sites using that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when the object owns a separate UTF-8 buffer: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a separate wstr buffer: its wstr pointer
   is set and either the object is not ready yet or the pointer is not the
   character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the range converted is [begin, end).  to points to the buffer receiving
   the result characters and is converted to "to_type *".

   Every element is copied with a plain cast to to_type.  The bulk of the
   range is handled four elements per iteration (the element count rounded
   down to a multiple of four); a second loop copies the remainder.

   The `to` argument is parenthesized before the cast so that an expression
   such as `buf + offset` is converted as a whole, and all locals carry a
   leading underscore to stay clear of names used by the caller. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast lookup for the most frequent whitespace characters: one entry per
   ASCII code point (0x00-0x7F), 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Line-break lookup table: one entry per ASCII code point (0x00-0x7F),
   1 for a line-break character and 0 otherwise.  The table is only ever
   read (see BLOOM_LINEBREAK), hence const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-only invariant checker for a unicode object.

   Asserts that the state bits (kind, compact, ascii, ready), the data
   pointer and the cached utf8/wstr members of op agree with each other for
   each layout handled below: compact ASCII, compact non-ASCII, strings of
   kind PyUnicode_WCHAR_KIND (not ready) and ready non-compact strings.
   When check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to assert that the maximum character
   matches the kind in use and that the buffer is null-terminated.

   Every violation trips an assert(); otherwise 1 is returned, so a call
   can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (ascii->state.compact != 1 || ascii->state.ascii != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character buffer doubles as the wstr buffer */
        unsigned int wstr_kind;

#if SIZEOF_WCHAR_T == 2
        wstr_kind = PyUnicode_2BYTE_KIND;
#else
        wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            /* non-compact object: characters live in a separate block */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert (compact->utf8 != data);
                }
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII: characters follow the object header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        Py_UCS4 ch;
        void *data;

        data = PyUnicode_DATA(ascii);
        for (pos = 0; pos < ascii->length; pos++) {
            ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
415 assert(Py_REFCNT(unicode) == 1);
416
417 len = _PyUnicode_WSTR_LENGTH(unicode);
418 if (len == 0) {
419 Py_INCREF(unicode_empty);
420 Py_DECREF(unicode);
421 return unicode_empty;
422 }
423
424 if (len == 1) {
425 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426 if (ch < 256) {
427 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428 Py_DECREF(unicode);
429 return latin1_char;
430 }
431 }
432
433 if (_PyUnicode_Ready(unicode) < 0) {
434 Py_XDECREF(unicode);
435 return NULL;
436 }
437#else
438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
#if defined(HAVE_MBCS)
/* OS version information record; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used: the bit index of a
   character is its value modulo BLOOM_WIDTH, i.e. its lowest 5, 6 or 7
   bits depending on LONG_BIT. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
Antoine Pitrouf068f942010-01-13 14:19:12 +0000519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit for ch in the lvalue mask.
   BLOOM(mask, ch): non-zero if the bit for ch is set in mask, i.e. ch
   *may* be a member; zero means it is definitely not one.
   The bit index is ch modulo BLOOM_WIDTH.  Both arguments are
   parenthesized so that expression arguments (e.g. mask1 | mask2) are
   evaluated as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True if ch is a line break: table lookup below 128, otherwise the bloom
   mask is tested first and Py_UNICODE_ISLINEBREAK() only runs when the
   mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
/* True if chr occurs in the string str: the bloom mask is tested first
   and PyUnicode_FindChar() over the whole string only runs when the mask
   does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643static PyObject*
644resize_compact(PyObject *unicode, Py_ssize_t length)
645{
646 Py_ssize_t char_size;
647 Py_ssize_t struct_size;
648 Py_ssize_t new_size;
649 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100650 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200651 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200688 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 return unicode;
690}
691
Alexander Belopolsky40018472011-02-26 01:02:56 +0000692static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200693resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694{
Victor Stinner95663112011-10-04 01:03:50 +0200695 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100696 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700 if (PyUnicode_IS_READY(unicode)) {
701 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200702 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 void *data;
704
705 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200706 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709
710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
711 PyErr_NoMemory();
712 return -1;
713 }
714 new_size = (length + 1) * char_size;
715
Victor Stinner7a9105a2011-12-12 00:13:42 +0100716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
717 {
718 PyObject_DEL(_PyUnicode_UTF8(unicode));
719 _PyUnicode_UTF8(unicode) = NULL;
720 _PyUnicode_UTF8_LENGTH(unicode) = 0;
721 }
722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 data = (PyObject *)PyObject_REALLOC(data, new_size);
724 if (data == NULL) {
725 PyErr_NoMemory();
726 return -1;
727 }
728 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200731 _PyUnicode_WSTR_LENGTH(unicode) = length;
732 }
733 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200734 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_UTF8_LENGTH(unicode) = length;
736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200740 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 }
Victor Stinner95663112011-10-04 01:03:50 +0200744 assert(_PyUnicode_WSTR(unicode) != NULL);
745
746 /* check for integer overflow */
747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
748 PyErr_NoMemory();
749 return -1;
750 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100751 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200752 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100753 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200754 if (!wstr) {
755 PyErr_NoMemory();
756 return -1;
757 }
758 _PyUnicode_WSTR(unicode) = wstr;
759 _PyUnicode_WSTR(unicode)[length] = 0;
760 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200761 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 return 0;
763}
764
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771
Benjamin Petersonbac79492012-01-14 13:34:47 -0500772 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774
775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776 if (copy == NULL)
777 return NULL;
778
779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200782 }
783 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200784 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200786 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 if (w == NULL)
788 return NULL;
789 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790 copy_length = Py_MIN(copy_length, length);
791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200793 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 }
795}
796
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

805
Alexander Belopolsky40018472011-02-26 01:02:56 +0000806static PyUnicodeObject *
807_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808{
809 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811
Thomas Wouters477c8d52006-05-27 19:21:47 +0000812 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 if (length == 0 && unicode_empty != NULL) {
814 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200815 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 }
817
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000818 /* Ensure we won't overflow the size. */
819 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
820 return (PyUnicodeObject *)PyErr_NoMemory();
821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822 if (length < 0) {
823 PyErr_SetString(PyExc_SystemError,
824 "Negative size passed to _PyUnicode_New");
825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
829 if (unicode == NULL)
830 return NULL;
831 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
832 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
833 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100834 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000835 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838
Jeremy Hyltond8082792003-09-16 19:41:39 +0000839 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000840 * the caller fails before initializing str -- unicode_resize()
841 * reads str[0], and the Keep-Alive optimization can keep memory
842 * allocated for str alive across a call to unicode_dealloc(unicode).
843 * We don't want unicode_resize to read uninitialized memory in
844 * that case.
845 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 _PyUnicode_WSTR(unicode)[0] = 0;
847 _PyUnicode_WSTR(unicode)[length] = 0;
848 _PyUnicode_WSTR_LENGTH(unicode) = length;
849 _PyUnicode_HASH(unicode) = -1;
850 _PyUnicode_STATE(unicode).interned = 0;
851 _PyUnicode_STATE(unicode).kind = 0;
852 _PyUnicode_STATE(unicode).compact = 0;
853 _PyUnicode_STATE(unicode).ready = 0;
854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200855 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200857 _PyUnicode_UTF8(unicode) = NULL;
858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000860 return unicode;
861}
862
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863static const char*
864unicode_kind_name(PyObject *unicode)
865{
Victor Stinner42dfd712011-10-03 14:41:45 +0200866 /* don't check consistency: unicode_kind_name() is called from
867 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200868 if (!PyUnicode_IS_COMPACT(unicode))
869 {
870 if (!PyUnicode_IS_READY(unicode))
871 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600872 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200873 {
874 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200875 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 return "legacy ascii";
877 else
878 return "legacy latin1";
879 case PyUnicode_2BYTE_KIND:
880 return "legacy UCS2";
881 case PyUnicode_4BYTE_KIND:
882 return "legacy UCS4";
883 default:
884 return "<legacy invalid kind>";
885 }
886 }
887 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600888 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200890 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 return "ascii";
892 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 default:
899 return "<invalid compact kind>";
900 }
901}
902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904/* Functions wrapping macros for use in debugger */
/* Debugger helper: function wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
908
/* Debugger helper: function wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
912void *_PyUnicode_data(void *unicode){
913 printf("obj %p\n", unicode);
914 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
915 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
916 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
917 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
918 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
919 return PyUnicode_DATA(unicode);
920}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200921
922void
923_PyUnicode_Dump(PyObject *op)
924{
925 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200926 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
927 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
928 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200929
Victor Stinnera849a4b2011-10-03 12:12:11 +0200930 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200931 {
932 if (ascii->state.ascii)
933 data = (ascii + 1);
934 else
935 data = (compact + 1);
936 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200937 else
938 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200939 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
940
Victor Stinnera849a4b2011-10-03 12:12:11 +0200941 if (ascii->wstr == data)
942 printf("shared ");
943 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200944
Victor Stinnera3b334d2011-10-03 13:53:37 +0200945 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 printf(" (%zu), ", compact->wstr_length);
947 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
948 printf("shared ");
949 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200950 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200952}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953#endif
954
955PyObject *
956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
957{
958 PyObject *obj;
959 PyCompactUnicodeObject *unicode;
960 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200961 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200962 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 Py_ssize_t char_size;
964 Py_ssize_t struct_size;
965
966 /* Optimization for empty strings */
967 if (size == 0 && unicode_empty != NULL) {
968 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200969 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 }
971
Victor Stinner9e9d6892011-10-04 01:02:02 +0200972 is_ascii = 0;
973 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 struct_size = sizeof(PyCompactUnicodeObject);
975 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200976 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 char_size = 1;
978 is_ascii = 1;
979 struct_size = sizeof(PyASCIIObject);
980 }
981 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200982 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 char_size = 1;
984 }
985 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +0200986 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 char_size = 2;
988 if (sizeof(wchar_t) == 2)
989 is_sharing = 1;
990 }
991 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +0100992 if (maxchar > MAX_UNICODE) {
993 PyErr_SetString(PyExc_SystemError,
994 "invalid maximum character passed to PyUnicode_New");
995 return NULL;
996 }
Victor Stinner8f825062012-04-27 13:55:39 +0200997 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 char_size = 4;
999 if (sizeof(wchar_t) == 4)
1000 is_sharing = 1;
1001 }
1002
1003 /* Ensure we won't overflow the size. */
1004 if (size < 0) {
1005 PyErr_SetString(PyExc_SystemError,
1006 "Negative size passed to PyUnicode_New");
1007 return NULL;
1008 }
1009 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1010 return PyErr_NoMemory();
1011
1012 /* Duplicated allocation code from _PyObject_New() instead of a call to
1013 * PyObject_New() so we are able to allocate space for the object and
1014 * it's data buffer.
1015 */
1016 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1017 if (obj == NULL)
1018 return PyErr_NoMemory();
1019 obj = PyObject_INIT(obj, &PyUnicode_Type);
1020 if (obj == NULL)
1021 return NULL;
1022
1023 unicode = (PyCompactUnicodeObject *)obj;
1024 if (is_ascii)
1025 data = ((PyASCIIObject*)obj) + 1;
1026 else
1027 data = unicode + 1;
1028 _PyUnicode_LENGTH(unicode) = size;
1029 _PyUnicode_HASH(unicode) = -1;
1030 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032 _PyUnicode_STATE(unicode).compact = 1;
1033 _PyUnicode_STATE(unicode).ready = 1;
1034 _PyUnicode_STATE(unicode).ascii = is_ascii;
1035 if (is_ascii) {
1036 ((char*)data)[size] = 0;
1037 _PyUnicode_WSTR(unicode) = NULL;
1038 }
Victor Stinner8f825062012-04-27 13:55:39 +02001039 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 else {
1047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 ((Py_UCS4*)data)[size] = 0;
1053 if (is_sharing) {
1054 _PyUnicode_WSTR_LENGTH(unicode) = size;
1055 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1056 }
1057 else {
1058 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 }
1061 }
Victor Stinner8f825062012-04-27 13:55:39 +02001062#ifdef Py_DEBUG
1063 /* Fill the data with invalid characters to detect bugs earlier.
1064 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1065 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1066 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1067 memset(data, 0xff, size * kind);
1068#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001069 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 return obj;
1071}
1072
1073#if SIZEOF_WCHAR_T == 2
1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1075 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001076 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001077
1078 This function assumes that unicode can hold one more code point than wstr
1079 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001080static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001082 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001083{
1084 const wchar_t *iter;
1085 Py_UCS4 *ucs4_out;
1086
Victor Stinner910337b2011-10-03 03:20:16 +02001087 assert(unicode != NULL);
1088 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1090 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1091
1092 for (iter = begin; iter < end; ) {
1093 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1094 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001095 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1096 && (iter+1) < end
1097 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 {
Victor Stinner551ac952011-11-29 22:58:13 +01001099 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100 iter += 2;
1101 }
1102 else {
1103 *ucs4_out++ = *iter;
1104 iter++;
1105 }
1106 }
1107 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1108 _PyUnicode_GET_LENGTH(unicode)));
1109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110}
1111#endif
1112
Victor Stinnercd9950f2011-10-02 00:34:53 +02001113static int
Victor Stinner488fa492011-12-12 00:01:39 +01001114unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115{
Victor Stinner488fa492011-12-12 00:01:39 +01001116 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001117 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001118 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119 return -1;
1120 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return 0;
1122}
1123
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001124static int
1125_copy_characters(PyObject *to, Py_ssize_t to_start,
1126 PyObject *from, Py_ssize_t from_start,
1127 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001129 unsigned int from_kind, to_kind;
1130 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131
Victor Stinneree4544c2012-05-09 22:24:08 +02001132 assert(0 <= how_many);
1133 assert(0 <= from_start);
1134 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001137 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerd3f08822012-05-29 12:57:52 +02001139 assert(PyUnicode_Check(to));
1140 assert(PyUnicode_IS_READY(to));
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001143 if (how_many == 0)
1144 return 0;
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerf1852262012-06-16 16:38:26 +02001151#ifdef Py_DEBUG
1152 if (!check_maxchar
1153 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1154 {
1155 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1156 Py_UCS4 ch;
1157 Py_ssize_t i;
1158 for (i=0; i < how_many; i++) {
1159 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1160 assert(ch <= to_maxchar);
1161 }
1162 }
1163#endif
1164
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001165 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001166 if (check_maxchar
1167 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1168 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001169 /* Writing Latin-1 characters into an ASCII string requires to
1170 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001171 Py_UCS4 max_char;
1172 max_char = ucs1lib_find_max_char(from_data,
1173 (Py_UCS1*)from_data + how_many);
1174 if (max_char >= 128)
1175 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001176 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001177 Py_MEMCPY((char*)to_data + to_kind * to_start,
1178 (char*)from_data + from_kind * from_start,
1179 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001181 else if (from_kind == PyUnicode_1BYTE_KIND
1182 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 {
1184 _PyUnicode_CONVERT_BYTES(
1185 Py_UCS1, Py_UCS2,
1186 PyUnicode_1BYTE_DATA(from) + from_start,
1187 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1188 PyUnicode_2BYTE_DATA(to) + to_start
1189 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001191 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 && to_kind == PyUnicode_4BYTE_KIND)
1193 {
1194 _PyUnicode_CONVERT_BYTES(
1195 Py_UCS1, Py_UCS4,
1196 PyUnicode_1BYTE_DATA(from) + from_start,
1197 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1198 PyUnicode_4BYTE_DATA(to) + to_start
1199 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001200 }
1201 else if (from_kind == PyUnicode_2BYTE_KIND
1202 && to_kind == PyUnicode_4BYTE_KIND)
1203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS2, Py_UCS4,
1206 PyUnicode_2BYTE_DATA(from) + from_start,
1207 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_4BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001212 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (!check_maxchar) {
1215 if (from_kind == PyUnicode_2BYTE_KIND
1216 && to_kind == PyUnicode_1BYTE_KIND)
1217 {
1218 _PyUnicode_CONVERT_BYTES(
1219 Py_UCS2, Py_UCS1,
1220 PyUnicode_2BYTE_DATA(from) + from_start,
1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222 PyUnicode_1BYTE_DATA(to) + to_start
1223 );
1224 }
1225 else if (from_kind == PyUnicode_4BYTE_KIND
1226 && to_kind == PyUnicode_1BYTE_KIND)
1227 {
1228 _PyUnicode_CONVERT_BYTES(
1229 Py_UCS4, Py_UCS1,
1230 PyUnicode_4BYTE_DATA(from) + from_start,
1231 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1232 PyUnicode_1BYTE_DATA(to) + to_start
1233 );
1234 }
1235 else if (from_kind == PyUnicode_4BYTE_KIND
1236 && to_kind == PyUnicode_2BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS4, Py_UCS2,
1240 PyUnicode_4BYTE_DATA(from) + from_start,
1241 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_2BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else {
1246 assert(0);
1247 return -1;
1248 }
1249 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001250 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001251 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001252 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001253 Py_ssize_t i;
1254
Victor Stinnera0702ab2011-09-29 14:14:38 +02001255 for (i=0; i < how_many; i++) {
1256 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001257 if (ch > to_maxchar)
1258 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1260 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001261 }
1262 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001263 return 0;
1264}
1265
Victor Stinnerd3f08822012-05-29 12:57:52 +02001266void
1267_PyUnicode_FastCopyCharacters(
1268 PyObject *to, Py_ssize_t to_start,
1269 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270{
1271 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1272}
1273
1274Py_ssize_t
1275PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1276 PyObject *from, Py_ssize_t from_start,
1277 Py_ssize_t how_many)
1278{
1279 int err;
1280
1281 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1282 PyErr_BadInternalCall();
1283 return -1;
1284 }
1285
Benjamin Petersonbac79492012-01-14 13:34:47 -05001286 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001287 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001288 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289 return -1;
1290
Victor Stinnerd3f08822012-05-29 12:57:52 +02001291 if (from_start < 0) {
1292 PyErr_SetString(PyExc_IndexError, "string index out of range");
1293 return -1;
1294 }
1295 if (to_start < 0) {
1296 PyErr_SetString(PyExc_IndexError, "string index out of range");
1297 return -1;
1298 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001299 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1300 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1301 PyErr_Format(PyExc_SystemError,
1302 "Cannot write %zi characters at %zi "
1303 "in a string of %zi characters",
1304 how_many, to_start, PyUnicode_GET_LENGTH(to));
1305 return -1;
1306 }
1307
1308 if (how_many == 0)
1309 return 0;
1310
Victor Stinner488fa492011-12-12 00:01:39 +01001311 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001312 return -1;
1313
1314 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1315 if (err) {
1316 PyErr_Format(PyExc_SystemError,
1317 "Cannot copy %s characters "
1318 "into a string of %s characters",
1319 unicode_kind_name(from),
1320 unicode_kind_name(to));
1321 return -1;
1322 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001323 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Victor Stinner17222162011-09-28 22:15:37 +02001326/* Find the maximum code point and count the number of surrogate pairs so a
1327 correct string length can be computed before converting a string to UCS4.
1328 This function counts single surrogates as a character and not as a pair.
1329
1330 Return 0 on success, or -1 on error. */
1331static int
1332find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1333 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334{
1335 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001336 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337
Victor Stinnerc53be962011-10-02 21:33:54 +02001338 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 *num_surrogates = 0;
1340 *maxchar = 0;
1341
1342 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001344 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1345 && (iter+1) < end
1346 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001348 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 iter += 2;
1351 }
1352 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001354 {
1355 ch = *iter;
1356 iter++;
1357 }
1358 if (ch > *maxchar) {
1359 *maxchar = ch;
1360 if (*maxchar > MAX_UNICODE) {
1361 PyErr_Format(PyExc_ValueError,
1362 "character U+%x is not in range [U+0000; U+10ffff]",
1363 ch);
1364 return -1;
1365 }
1366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 }
1368 return 0;
1369}
1370
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001371int
1372_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373{
1374 wchar_t *end;
1375 Py_UCS4 maxchar = 0;
1376 Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378 Py_ssize_t length_wo_surrogates;
1379#endif
1380
Georg Brandl7597add2011-10-05 16:36:47 +02001381 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 strings were created using _PyObject_New() and where no canonical
1383 representation (the str field) has been set yet aka strings
1384 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001385 assert(_PyUnicode_CHECK(unicode));
1386 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001390 /* Actually, it should neither be interned nor be anything else: */
1391 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001394 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001395 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397
1398 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001399 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1400 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 PyErr_NoMemory();
1402 return -1;
1403 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001404 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 _PyUnicode_WSTR(unicode), end,
1406 PyUnicode_1BYTE_DATA(unicode));
1407 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1408 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1409 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1410 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001411 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001413 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 }
1415 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001416 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 }
1420 PyObject_FREE(_PyUnicode_WSTR(unicode));
1421 _PyUnicode_WSTR(unicode) = NULL;
1422 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1423 }
1424 /* In this case we might have to convert down from 4-byte native
1425 wchar_t to 2-byte unicode. */
1426 else if (maxchar < 65536) {
1427 assert(num_surrogates == 0 &&
1428 "FindMaxCharAndNumSurrogatePairs() messed up");
1429
Victor Stinner506f5922011-09-28 22:34:18 +02001430#if SIZEOF_WCHAR_T == 2
1431 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001433 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1434 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1435 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8(unicode) = NULL;
1437 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001438#else
1439 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001441 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001442 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001443 PyErr_NoMemory();
1444 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
Victor Stinner506f5922011-09-28 22:34:18 +02001446 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1447 _PyUnicode_WSTR(unicode), end,
1448 PyUnicode_2BYTE_DATA(unicode));
1449 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1450 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001454 PyObject_FREE(_PyUnicode_WSTR(unicode));
1455 _PyUnicode_WSTR(unicode) = NULL;
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 }
1459 /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
1460 else {
1461#if SIZEOF_WCHAR_T == 2
1462 /* in case the native representation is 2-bytes, we need to allocate a
1463 new normalized 4-byte version. */
1464 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1466 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyErr_NoMemory();
1468 return -1;
1469 }
1470 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001474 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1475 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001476 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 PyObject_FREE(_PyUnicode_WSTR(unicode));
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#else
1481 assert(num_surrogates == 0);
1482
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001485 _PyUnicode_UTF8(unicode) = NULL;
1486 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1488#endif
1489 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1490 }
1491 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001492 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 return 0;
1494}
1495
Alexander Belopolsky40018472011-02-26 01:02:56 +00001496static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001497unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498{
Walter Dörwald16807132007-05-25 13:52:07 +00001499 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 case SSTATE_NOT_INTERNED:
1501 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001502
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 case SSTATE_INTERNED_MORTAL:
1504 /* revive dead object temporarily for DelItem */
1505 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001506 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 Py_FatalError(
1508 "deletion of interned string failed");
1509 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001510
Benjamin Peterson29060642009-01-31 22:14:21 +00001511 case SSTATE_INTERNED_IMMORTAL:
1512 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001513
Benjamin Peterson29060642009-01-31 22:14:21 +00001514 default:
1515 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001516 }
1517
Victor Stinner03490912011-10-03 23:45:12 +02001518 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001520 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001521 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001522 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1523 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001525 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526}
1527
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a real kind can be a cached
       Latin-1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1544
Alexander Belopolsky40018472011-02-26 01:02:56 +00001545static int
Victor Stinner488fa492011-12-12 00:01:39 +01001546unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001547{
Victor Stinner488fa492011-12-12 00:01:39 +01001548 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (Py_REFCNT(unicode) != 1)
1550 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001551 if (_PyUnicode_HASH(unicode) != -1)
1552 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 if (PyUnicode_CHECK_INTERNED(unicode))
1554 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001555 if (!PyUnicode_CheckExact(unicode))
1556 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001557#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 /* singleton refcount is greater than 1 */
1559 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001560#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001561 return 1;
1562}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001563
Victor Stinnerfe226c02011-10-03 03:52:20 +02001564static int
1565unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1566{
1567 PyObject *unicode;
1568 Py_ssize_t old_length;
1569
1570 assert(p_unicode != NULL);
1571 unicode = *p_unicode;
1572
1573 assert(unicode != NULL);
1574 assert(PyUnicode_Check(unicode));
1575 assert(0 <= length);
1576
Victor Stinner910337b2011-10-03 03:20:16 +02001577 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001578 old_length = PyUnicode_WSTR_LENGTH(unicode);
1579 else
1580 old_length = PyUnicode_GET_LENGTH(unicode);
1581 if (old_length == length)
1582 return 0;
1583
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001584 if (length == 0) {
1585 Py_DECREF(*p_unicode);
1586 *p_unicode = unicode_empty;
1587 Py_INCREF(*p_unicode);
1588 return 0;
1589 }
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 PyObject *copy = resize_copy(unicode, length);
1593 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001594 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 Py_DECREF(*p_unicode);
1596 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001597 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598 }
1599
Victor Stinnerfe226c02011-10-03 03:52:20 +02001600 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001601 PyObject *new_unicode = resize_compact(unicode, length);
1602 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001605 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001606 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001607 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001608}
1609
Alexander Belopolsky40018472011-02-26 01:02:56 +00001610int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001611PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001612{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613 PyObject *unicode;
1614 if (p_unicode == NULL) {
1615 PyErr_BadInternalCall();
1616 return -1;
1617 }
1618 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001619 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 {
1621 PyErr_BadInternalCall();
1622 return -1;
1623 }
1624 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001625}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001627static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001628unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1629 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001630{
1631 PyObject *result;
1632 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001633 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001634 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1635 return 0;
1636 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1637 maxchar);
1638 if (result == NULL)
1639 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001640 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001641 Py_DECREF(*p_unicode);
1642 *p_unicode = result;
1643 return 0;
1644}
1645
1646static int
1647unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1648 Py_UCS4 ch)
1649{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001650 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001651 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652 return -1;
1653 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1654 PyUnicode_DATA(*p_unicode),
1655 (*pos)++, ch);
1656 return 0;
1657}
1658
Victor Stinnerc5166102012-02-22 13:55:02 +01001659/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001660
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001661 WARNING: The function doesn't copy the terminating null character and
1662 doesn't check the maximum character (may write a latin1 character in an
1663 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001664static void
1665unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1666 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001667{
1668 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1669 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001670 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001671
1672 switch (kind) {
1673 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001674 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001675 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001676 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001677 }
1678 case PyUnicode_2BYTE_KIND: {
1679 Py_UCS2 *start = (Py_UCS2 *)data + index;
1680 Py_UCS2 *ucs2 = start;
1681 assert(index <= PyUnicode_GET_LENGTH(unicode));
1682
Victor Stinner184252a2012-06-16 02:57:41 +02001683 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001684 *ucs2 = (Py_UCS2)*str;
1685
1686 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001687 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001688 }
1689 default: {
1690 Py_UCS4 *start = (Py_UCS4 *)data + index;
1691 Py_UCS4 *ucs4 = start;
1692 assert(kind == PyUnicode_4BYTE_KIND);
1693 assert(index <= PyUnicode_GET_LENGTH(unicode));
1694
Victor Stinner184252a2012-06-16 02:57:41 +02001695 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001696 *ucs4 = (Py_UCS4)*str;
1697
1698 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001699 }
1700 }
1701}
1702
1703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704static PyObject*
1705get_latin1_char(unsigned char ch)
1706{
Victor Stinnera464fc12011-10-02 20:39:30 +02001707 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001709 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 if (!unicode)
1711 return NULL;
1712 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001713 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 unicode_latin1[ch] = unicode;
1715 }
1716 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001717 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001723 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 Py_UCS4 maxchar = 0;
1725 Py_ssize_t num_surrogates;
1726
1727 if (u == NULL)
1728 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001730 /* If the Unicode data is known at construction time, we can apply
1731 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 /* Optimization for empty strings */
1734 if (size == 0 && unicode_empty != NULL) {
1735 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001736 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001737 }
Tim Petersced69f82003-09-16 20:30:58 +00001738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 /* Single character Unicode objects in the Latin-1 range are
1740 shared when using this constructor */
1741 if (size == 1 && *u < 256)
1742 return get_latin1_char((unsigned char)*u);
1743
1744 /* If not empty and not single character, copy the Unicode data
1745 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001746 if (find_maxchar_surrogates(u, u + size,
1747 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 return NULL;
1749
Victor Stinner8faf8212011-12-08 22:14:11 +01001750 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 if (!unicode)
1752 return NULL;
1753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 switch (PyUnicode_KIND(unicode)) {
1755 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001756 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1758 break;
1759 case PyUnicode_2BYTE_KIND:
1760#if Py_UNICODE_SIZE == 2
1761 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1762#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001763 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1765#endif
1766 break;
1767 case PyUnicode_4BYTE_KIND:
1768#if SIZEOF_WCHAR_T == 2
1769 /* This is the only case which has to process surrogates, thus
1770 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001771 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772#else
1773 assert(num_surrogates == 0);
1774 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1775#endif
1776 break;
1777 default:
1778 assert(0 && "Impossible state");
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001781 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782}
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784PyObject *
1785PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001786{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 if (size < 0) {
1788 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 return NULL;
1791 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001792 if (u != NULL)
1793 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1794 else
1795 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001800{
1801 size_t size = strlen(u);
1802 if (size > PY_SSIZE_T_MAX) {
1803 PyErr_SetString(PyExc_OverflowError, "input too long");
1804 return NULL;
1805 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001806 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001807}
1808
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001809PyObject *
1810_PyUnicode_FromId(_Py_Identifier *id)
1811{
1812 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001813 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1814 strlen(id->string),
1815 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001816 if (!id->object)
1817 return NULL;
1818 PyUnicode_InternInPlace(&id->object);
1819 assert(!id->next);
1820 id->next = static_strings;
1821 static_strings = id;
1822 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001823 return id->object;
1824}
1825
1826void
1827_PyUnicode_ClearStaticStrings()
1828{
1829 _Py_Identifier *i;
1830 for (i = static_strings; i; i = i->next) {
1831 Py_DECREF(i->object);
1832 i->object = NULL;
1833 i->next = NULL;
1834 }
1835}
1836
Benjamin Peterson0df54292012-03-26 14:50:32 -04001837/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838
Victor Stinnerd3f08822012-05-29 12:57:52 +02001839PyObject*
1840_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001841{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001842 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001843 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001844 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001845#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001846 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001847#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001848 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001849 }
Victor Stinner785938e2011-12-11 20:09:03 +01001850 unicode = PyUnicode_New(size, 127);
1851 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001852 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001853 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1854 assert(_PyUnicode_CheckConsistency(unicode, 1));
1855 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001856}
1857
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858static Py_UCS4
1859kind_maxchar_limit(unsigned int kind)
1860{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001861 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001862 case PyUnicode_1BYTE_KIND:
1863 return 0x80;
1864 case PyUnicode_2BYTE_KIND:
1865 return 0x100;
1866 case PyUnicode_4BYTE_KIND:
1867 return 0x10000;
1868 default:
1869 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001870 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001871 }
1872}
1873
Victor Stinnere6abb482012-05-02 01:15:40 +02001874Py_LOCAL_INLINE(Py_UCS4)
1875align_maxchar(Py_UCS4 maxchar)
1876{
1877 if (maxchar <= 127)
1878 return 127;
1879 else if (maxchar <= 255)
1880 return 255;
1881 else if (maxchar <= 65535)
1882 return 65535;
1883 else
1884 return MAX_UNICODE;
1885}
1886
Victor Stinner702c7342011-10-05 13:50:52 +02001887static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001888_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001891 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001892
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001893 if (size == 0) {
1894 Py_INCREF(unicode_empty);
1895 return unicode_empty;
1896 }
1897 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001898 if (size == 1)
1899 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001900
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001901 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 if (!res)
1904 return NULL;
1905 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001906 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001908}
1909
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910static PyObject*
1911_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912{
1913 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001915
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 if (size == 0) {
1917 Py_INCREF(unicode_empty);
1918 return unicode_empty;
1919 }
1920 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001921 if (size == 1) {
1922 Py_UCS4 ch = u[0];
1923 if (ch < 256)
1924 return get_latin1_char((unsigned char)ch);
1925
1926 res = PyUnicode_New(1, ch);
1927 if (res == NULL)
1928 return NULL;
1929 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1930 assert(_PyUnicode_CheckConsistency(res, 1));
1931 return res;
1932 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001935 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 if (!res)
1937 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 else {
1941 _PyUnicode_CONVERT_BYTES(
1942 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1943 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001944 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 return res;
1946}
1947
Victor Stinnere57b1c02011-09-28 22:20:48 +02001948static PyObject*
1949_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950{
1951 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001952 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001953
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001954 if (size == 0) {
1955 Py_INCREF(unicode_empty);
1956 return unicode_empty;
1957 }
1958 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001959 if (size == 1) {
1960 Py_UCS4 ch = u[0];
1961 if (ch < 256)
1962 return get_latin1_char((unsigned char)ch);
1963
1964 res = PyUnicode_New(1, ch);
1965 if (res == NULL)
1966 return NULL;
1967 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1968 assert(_PyUnicode_CheckConsistency(res, 1));
1969 return res;
1970 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001972 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001973 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974 if (!res)
1975 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001976 if (max_char < 256)
1977 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1978 PyUnicode_1BYTE_DATA(res));
1979 else if (max_char < 0x10000)
1980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1981 PyUnicode_2BYTE_DATA(res));
1982 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 return res;
1986}
1987
1988PyObject*
1989PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1990{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001991 if (size < 0) {
1992 PyErr_SetString(PyExc_ValueError, "size must be positive");
1993 return NULL;
1994 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001995 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001999 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002001 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002003 PyErr_SetString(PyExc_SystemError, "invalid kind");
2004 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006}
2007
Victor Stinnerece58de2012-04-23 23:36:38 +02002008Py_UCS4
2009_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2010{
2011 enum PyUnicode_Kind kind;
2012 void *startptr, *endptr;
2013
2014 assert(PyUnicode_IS_READY(unicode));
2015 assert(0 <= start);
2016 assert(end <= PyUnicode_GET_LENGTH(unicode));
2017 assert(start <= end);
2018
2019 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2020 return PyUnicode_MAX_CHAR_VALUE(unicode);
2021
2022 if (start == end)
2023 return 127;
2024
Victor Stinner94d558b2012-04-27 22:26:58 +02002025 if (PyUnicode_IS_ASCII(unicode))
2026 return 127;
2027
Victor Stinnerece58de2012-04-23 23:36:38 +02002028 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002029 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002030 endptr = (char *)startptr + end * kind;
2031 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002032 switch(kind) {
2033 case PyUnicode_1BYTE_KIND:
2034 return ucs1lib_find_max_char(startptr, endptr);
2035 case PyUnicode_2BYTE_KIND:
2036 return ucs2lib_find_max_char(startptr, endptr);
2037 case PyUnicode_4BYTE_KIND:
2038 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002039 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002040 assert(0);
2041 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 }
2043}
2044
Victor Stinner25a4b292011-10-06 12:31:55 +02002045/* Ensure that a string uses the most efficient storage, if it is not the
2046 case: create a new string with of the right kind. Write NULL into *p_unicode
2047 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002048static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002049unicode_adjust_maxchar(PyObject **p_unicode)
2050{
2051 PyObject *unicode, *copy;
2052 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002053 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002054 unsigned int kind;
2055
2056 assert(p_unicode != NULL);
2057 unicode = *p_unicode;
2058 assert(PyUnicode_IS_READY(unicode));
2059 if (PyUnicode_IS_ASCII(unicode))
2060 return;
2061
2062 len = PyUnicode_GET_LENGTH(unicode);
2063 kind = PyUnicode_KIND(unicode);
2064 if (kind == PyUnicode_1BYTE_KIND) {
2065 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002066 max_char = ucs1lib_find_max_char(u, u + len);
2067 if (max_char >= 128)
2068 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002069 }
2070 else if (kind == PyUnicode_2BYTE_KIND) {
2071 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002072 max_char = ucs2lib_find_max_char(u, u + len);
2073 if (max_char >= 256)
2074 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002075 }
2076 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002077 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002079 max_char = ucs4lib_find_max_char(u, u + len);
2080 if (max_char >= 0x10000)
2081 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002084 if (copy != NULL)
2085 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 Py_DECREF(unicode);
2087 *p_unicode = copy;
2088}
2089
Victor Stinner034f6cf2011-09-30 02:26:44 +02002090PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002091_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092{
Victor Stinner87af4f22011-11-21 23:03:47 +01002093 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002094 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002095
Victor Stinner034f6cf2011-09-30 02:26:44 +02002096 if (!PyUnicode_Check(unicode)) {
2097 PyErr_BadInternalCall();
2098 return NULL;
2099 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002100 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002101 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002102
Victor Stinner87af4f22011-11-21 23:03:47 +01002103 length = PyUnicode_GET_LENGTH(unicode);
2104 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002105 if (!copy)
2106 return NULL;
2107 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2108
Victor Stinner87af4f22011-11-21 23:03:47 +01002109 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2110 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002111 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002112 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002113}
2114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115
Victor Stinnerbc603d12011-10-02 01:00:40 +02002116/* Widen Unicode objects to larger buffers. Don't write terminating null
2117 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118
2119void*
2120_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2121{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002122 Py_ssize_t len;
2123 void *result;
2124 unsigned int skind;
2125
Benjamin Petersonbac79492012-01-14 13:34:47 -05002126 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002127 return NULL;
2128
2129 len = PyUnicode_GET_LENGTH(s);
2130 skind = PyUnicode_KIND(s);
2131 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002132 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 return NULL;
2134 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002135 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136 case PyUnicode_2BYTE_KIND:
2137 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2138 if (!result)
2139 return PyErr_NoMemory();
2140 assert(skind == PyUnicode_1BYTE_KIND);
2141 _PyUnicode_CONVERT_BYTES(
2142 Py_UCS1, Py_UCS2,
2143 PyUnicode_1BYTE_DATA(s),
2144 PyUnicode_1BYTE_DATA(s) + len,
2145 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 case PyUnicode_4BYTE_KIND:
2148 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2149 if (!result)
2150 return PyErr_NoMemory();
2151 if (skind == PyUnicode_2BYTE_KIND) {
2152 _PyUnicode_CONVERT_BYTES(
2153 Py_UCS2, Py_UCS4,
2154 PyUnicode_2BYTE_DATA(s),
2155 PyUnicode_2BYTE_DATA(s) + len,
2156 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 else {
2159 assert(skind == PyUnicode_1BYTE_KIND);
2160 _PyUnicode_CONVERT_BYTES(
2161 Py_UCS1, Py_UCS4,
2162 PyUnicode_1BYTE_DATA(s),
2163 PyUnicode_1BYTE_DATA(s) + len,
2164 result);
2165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 default:
2168 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 }
Victor Stinner01698042011-10-04 00:04:26 +02002170 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 return NULL;
2172}
2173
2174static Py_UCS4*
2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2176 int copy_null)
2177{
2178 int kind;
2179 void *data;
2180 Py_ssize_t len, targetlen;
2181 if (PyUnicode_READY(string) == -1)
2182 return NULL;
2183 kind = PyUnicode_KIND(string);
2184 data = PyUnicode_DATA(string);
2185 len = PyUnicode_GET_LENGTH(string);
2186 targetlen = len;
2187 if (copy_null)
2188 targetlen++;
2189 if (!target) {
2190 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2191 PyErr_NoMemory();
2192 return NULL;
2193 }
2194 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2195 if (!target) {
2196 PyErr_NoMemory();
2197 return NULL;
2198 }
2199 }
2200 else {
2201 if (targetsize < targetlen) {
2202 PyErr_Format(PyExc_SystemError,
2203 "string is longer than the buffer");
2204 if (copy_null && 0 < targetsize)
2205 target[0] = 0;
2206 return NULL;
2207 }
2208 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002209 if (kind == PyUnicode_1BYTE_KIND) {
2210 Py_UCS1 *start = (Py_UCS1 *) data;
2211 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002213 else if (kind == PyUnicode_2BYTE_KIND) {
2214 Py_UCS2 *start = (Py_UCS2 *) data;
2215 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2216 }
2217 else {
2218 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 if (copy_null)
2222 target[len] = 0;
2223 return target;
2224}
2225
2226Py_UCS4*
2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002230 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 PyErr_BadInternalCall();
2232 return NULL;
2233 }
2234 return as_ucs4(string, target, targetsize, copy_null);
2235}
2236
2237Py_UCS4*
2238PyUnicode_AsUCS4Copy(PyObject *string)
2239{
2240 return as_ucs4(string, NULL, 0, 1);
2241}
2242
2243#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002244
Alexander Belopolsky40018472011-02-26 01:02:56 +00002245PyObject *
2246PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002249 if (size == 0) {
2250 Py_INCREF(unicode_empty);
2251 return unicode_empty;
2252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002253 PyErr_BadInternalCall();
2254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 }
2256
Martin v. Löwis790465f2008-04-05 20:41:37 +00002257 if (size == -1) {
2258 size = wcslen(w);
2259 }
2260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262}
2263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002265
Walter Dörwald346737f2007-05-31 10:44:43 +00002266static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2268 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002269{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 *fmt++ = '%';
2271 if (width) {
2272 if (zeropad)
2273 *fmt++ = '0';
2274 fmt += sprintf(fmt, "%d", width);
2275 }
2276 if (precision)
2277 fmt += sprintf(fmt, ".%d", precision);
2278 if (longflag)
2279 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002280 else if (longlongflag) {
2281 /* longlongflag should only ever be nonzero on machines with
2282 HAVE_LONG_LONG defined */
2283#ifdef HAVE_LONG_LONG
2284 char *f = PY_FORMAT_LONG_LONG;
2285 while (*f)
2286 *fmt++ = *f++;
2287#else
2288 /* we shouldn't ever get here */
2289 assert(0);
2290 *fmt++ = 'l';
2291#endif
2292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 else if (size_tflag) {
2294 char *f = PY_FORMAT_SIZE_T;
2295 while (*f)
2296 *fmt++ = *f++;
2297 }
2298 *fmt++ = c;
2299 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002300}
2301
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that sits between '%' and the conversion character.

   `f` must point to the '%'.  The decimal width and precision (0 when
   absent) and the "l", "ll" and "z" length flags are stored through the
   corresponding pointers; any of these pointers may be NULL.  A length
   modifier is recognised only when directly followed by 'd', 'u' or 'i'.

   Return a pointer to the conversion character.  For the bogus inputs
   handled below the pointer is moved back by one character so that the
   caller never steps past the end of the format string. */

#define FMT_IS_INT_CONV(ch) ((ch) == 'd' || (ch) == 'u' || (ch) == 'i')

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;    /* skip the '%' */
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cur); cur++)
        width = (width*10) + *cur - '0';
    if (*cur == '.') {
        for (cur++; Py_ISDIGIT((unsigned)*cur); cur++)
            precision = (precision*10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => cur points to "3" */
            cur--;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, cur points to "1" */
        cur--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu). */
    switch (*cur) {
    case 'l':
        if (FMT_IS_INT_CONV(cur[1])) {
            longflag = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' && FMT_IS_INT_CONV(cur[2])) {
            longlongflag = 1;
            cur += 2;
        }
#endif
        break;
    case 'z':
        if (FMT_IS_INT_CONV(cur[1])) {
            size_tflag = 1;
            cur += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return cur;
}

#undef FMT_IS_INT_CONV
2366
/* Maximum number of characters required for the output of %ld.  21
   characters allow for a 64-bit integer (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2374
Walter Dörwaldd2034312007-05-18 16:29:38 +00002375PyObject *
2376PyUnicode_FromFormatV(const char *format, va_list vargs)
2377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002378 va_list count;
2379 Py_ssize_t callcount = 0;
2380 PyObject **callresults = NULL;
2381 PyObject **callresult = NULL;
2382 Py_ssize_t n = 0;
2383 int width = 0;
2384 int precision = 0;
2385 int zeropad;
2386 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002387 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002388 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002389 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2391 Py_UCS4 argmaxchar;
2392 Py_ssize_t numbersize = 0;
2393 char *numberresults = NULL;
2394 char *numberresult = NULL;
2395 Py_ssize_t i;
2396 int kind;
2397 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002398
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002399 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002400 /* step 1: count the number of %S/%R/%A/%s format specifications
2401 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2402 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002404 * also estimate a upper bound for all the number formats in the string,
2405 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 for (f = format; *f; f++) {
2408 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002409 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2411 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2412 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2413 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002416#ifdef HAVE_LONG_LONG
2417 if (longlongflag) {
2418 if (width < MAX_LONG_LONG_CHARS)
2419 width = MAX_LONG_LONG_CHARS;
2420 }
2421 else
2422#endif
2423 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2424 including sign. Decimal takes the most space. This
2425 isn't enough for octal. If a width is specified we
2426 need more (which we allocate later). */
2427 if (width < MAX_LONG_CHARS)
2428 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429
2430 /* account for the size + '\0' to separate numbers
2431 inside of the numberresults buffer */
2432 numbersize += (width + 1);
2433 }
2434 }
2435 else if ((unsigned char)*f > 127) {
2436 PyErr_Format(PyExc_ValueError,
2437 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2438 "string, got a non-ASCII byte: 0x%02x",
2439 (unsigned char)*f);
2440 return NULL;
2441 }
2442 }
2443 /* step 2: allocate memory for the results of
2444 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2445 if (callcount) {
2446 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2447 if (!callresults) {
2448 PyErr_NoMemory();
2449 return NULL;
2450 }
2451 callresult = callresults;
2452 }
2453 /* step 2.5: allocate memory for the results of formating numbers */
2454 if (numbersize) {
2455 numberresults = PyObject_Malloc(numbersize);
2456 if (!numberresults) {
2457 PyErr_NoMemory();
2458 goto fail;
2459 }
2460 numberresult = numberresults;
2461 }
2462
2463 /* step 3: format numbers and figure out how large a buffer we need */
2464 for (f = format; *f; f++) {
2465 if (*f == '%') {
2466 const char* p;
2467 int longflag;
2468 int longlongflag;
2469 int size_tflag;
2470 int numprinted;
2471
2472 p = f;
2473 zeropad = (f[1] == '0');
2474 f = parse_format_flags(f, &width, &precision,
2475 &longflag, &longlongflag, &size_tflag);
2476 switch (*f) {
2477 case 'c':
2478 {
2479 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002480 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 n++;
2482 break;
2483 }
2484 case '%':
2485 n++;
2486 break;
2487 case 'i':
2488 case 'd':
2489 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2490 width, precision, *f);
2491 if (longflag)
2492 numprinted = sprintf(numberresult, fmt,
2493 va_arg(count, long));
2494#ifdef HAVE_LONG_LONG
2495 else if (longlongflag)
2496 numprinted = sprintf(numberresult, fmt,
2497 va_arg(count, PY_LONG_LONG));
2498#endif
2499 else if (size_tflag)
2500 numprinted = sprintf(numberresult, fmt,
2501 va_arg(count, Py_ssize_t));
2502 else
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, int));
2505 n += numprinted;
2506 /* advance by +1 to skip over the '\0' */
2507 numberresult += (numprinted + 1);
2508 assert(*(numberresult - 1) == '\0');
2509 assert(*(numberresult - 2) != '\0');
2510 assert(numprinted >= 0);
2511 assert(numberresult <= numberresults + numbersize);
2512 break;
2513 case 'u':
2514 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2515 width, precision, 'u');
2516 if (longflag)
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, unsigned long));
2519#ifdef HAVE_LONG_LONG
2520 else if (longlongflag)
2521 numprinted = sprintf(numberresult, fmt,
2522 va_arg(count, unsigned PY_LONG_LONG));
2523#endif
2524 else if (size_tflag)
2525 numprinted = sprintf(numberresult, fmt,
2526 va_arg(count, size_t));
2527 else
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, unsigned int));
2530 n += numprinted;
2531 numberresult += (numprinted + 1);
2532 assert(*(numberresult - 1) == '\0');
2533 assert(*(numberresult - 2) != '\0');
2534 assert(numprinted >= 0);
2535 assert(numberresult <= numberresults + numbersize);
2536 break;
2537 case 'x':
2538 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2539 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2540 n += numprinted;
2541 numberresult += (numprinted + 1);
2542 assert(*(numberresult - 1) == '\0');
2543 assert(*(numberresult - 2) != '\0');
2544 assert(numprinted >= 0);
2545 assert(numberresult <= numberresults + numbersize);
2546 break;
2547 case 'p':
2548 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2549 /* %p is ill-defined: ensure leading 0x. */
2550 if (numberresult[1] == 'X')
2551 numberresult[1] = 'x';
2552 else if (numberresult[1] != 'x') {
2553 memmove(numberresult + 2, numberresult,
2554 strlen(numberresult) + 1);
2555 numberresult[0] = '0';
2556 numberresult[1] = 'x';
2557 numprinted += 2;
2558 }
2559 n += numprinted;
2560 numberresult += (numprinted + 1);
2561 assert(*(numberresult - 1) == '\0');
2562 assert(*(numberresult - 2) != '\0');
2563 assert(numprinted >= 0);
2564 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 break;
2566 case 's':
2567 {
2568 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002569 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002570 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002571 if (!str)
2572 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 /* since PyUnicode_DecodeUTF8 returns already flexible
2574 unicode objects, there is no need to call ready on them */
2575 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002576 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002578 /* Remember the str and switch to the next slot */
2579 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 break;
2581 }
2582 case 'U':
2583 {
2584 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002585 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 if (PyUnicode_READY(obj) == -1)
2587 goto fail;
2588 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002589 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 }
2593 case 'V':
2594 {
2595 PyObject *obj = va_arg(count, PyObject *);
2596 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002597 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002599 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 if (PyUnicode_READY(obj) == -1)
2602 goto fail;
2603 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002604 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002606 *callresult++ = NULL;
2607 }
2608 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002609 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002610 if (!str_obj)
2611 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002612 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002613 Py_DECREF(str_obj);
2614 goto fail;
2615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002617 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002619 *callresult++ = str_obj;
2620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'S':
2624 {
2625 PyObject *obj = va_arg(count, PyObject *);
2626 PyObject *str;
2627 assert(obj);
2628 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002629 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (PyUnicode_READY(str) == -1) {
2632 Py_DECREF(str);
2633 goto fail;
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002636 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 /* Remember the str and switch to the next slot */
2639 *callresult++ = str;
2640 break;
2641 }
2642 case 'R':
2643 {
2644 PyObject *obj = va_arg(count, PyObject *);
2645 PyObject *repr;
2646 assert(obj);
2647 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002648 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002650 if (PyUnicode_READY(repr) == -1) {
2651 Py_DECREF(repr);
2652 goto fail;
2653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002655 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* Remember the repr and switch to the next slot */
2658 *callresult++ = repr;
2659 break;
2660 }
2661 case 'A':
2662 {
2663 PyObject *obj = va_arg(count, PyObject *);
2664 PyObject *ascii;
2665 assert(obj);
2666 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002669 if (PyUnicode_READY(ascii) == -1) {
2670 Py_DECREF(ascii);
2671 goto fail;
2672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002674 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 /* Remember the repr and switch to the next slot */
2677 *callresult++ = ascii;
2678 break;
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 default:
2681 /* if we stumble upon an unknown
2682 formatting code, copy the rest of
2683 the format string to the output
2684 string. (we cannot just skip the
2685 code, since there's no way to know
2686 what's in the argument list) */
2687 n += strlen(p);
2688 goto expand;
2689 }
2690 } else
2691 n++;
2692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 we don't have to resize the string.
2697 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002698 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 if (!string)
2700 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 kind = PyUnicode_KIND(string);
2702 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002708 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002709
2710 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2712 /* checking for == because the last argument could be a empty
2713 string, which causes i to point to end, the assert at the end of
2714 the loop */
2715 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 switch (*f) {
2718 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002719 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 const int ordinal = va_arg(vargs, int);
2721 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002723 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002724 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002729 {
Victor Stinner184252a2012-06-16 02:57:41 +02002730 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 /* unused, since we already have the result */
2732 if (*f == 'p')
2733 (void) va_arg(vargs, void *);
2734 else
2735 (void) va_arg(vargs, int);
2736 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002737 len = strlen(numberresult);
2738 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002740 i += len;
2741 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 assert(*numberresult == '\0');
2743 numberresult++;
2744 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 case 's':
2748 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002749 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002751 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 size = PyUnicode_GET_LENGTH(*callresult);
2753 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002754 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002756 /* We're done with the unicode()/repr() => forget it */
2757 Py_DECREF(*callresult);
2758 /* switch to next unicode()/repr() result */
2759 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 break;
2761 }
2762 case 'U':
2763 {
2764 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765 Py_ssize_t size;
2766 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2767 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002768 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 break;
2771 }
2772 case 'V':
2773 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002775 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002776 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 size = PyUnicode_GET_LENGTH(obj);
2779 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002780 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 size = PyUnicode_GET_LENGTH(*callresult);
2784 assert(PyUnicode_KIND(*callresult) <=
2785 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002786 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002788 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 break;
2792 }
2793 case 'S':
2794 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002795 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002797 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 /* unused, since we already have the result */
2799 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002801 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002802 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 /* We're done with the unicode()/repr() => forget it */
2804 Py_DECREF(*callresult);
2805 /* switch to next unicode()/repr() result */
2806 ++callresult;
2807 break;
2808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 break;
2812 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002813 {
2814 Py_ssize_t len = strlen(p);
2815 unicode_write_cstr(string, i, p, len);
2816 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002818 goto end;
2819 }
Victor Stinner184252a2012-06-16 02:57:41 +02002820 }
Victor Stinner1205f272010-09-11 00:54:47 +00002821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822 else {
2823 assert(i < PyUnicode_GET_LENGTH(string));
2824 PyUnicode_WRITE(kind, data, i++, *f);
2825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002828
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002830 if (callresults)
2831 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 if (numberresults)
2833 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002834 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002836 if (callresults) {
2837 PyObject **callresult2 = callresults;
2838 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002839 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002840 ++callresult2;
2841 }
2842 PyObject_Free(callresults);
2843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002844 if (numberresults)
2845 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002846 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002847}
2848
Walter Dörwaldd2034312007-05-18 16:29:38 +00002849PyObject *
2850PyUnicode_FromFormat(const char *format, ...)
2851{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002852 PyObject* ret;
2853 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854
2855#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002856 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002857#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002859#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002860 ret = PyUnicode_FromFormatV(format, vargs);
2861 va_end(vargs);
2862 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002863}
2864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002865#ifdef HAVE_WCHAR_H
2866
Victor Stinner5593d8a2010-10-02 11:11:27 +00002867/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2868 convert a Unicode object to a wide character string.
2869
Victor Stinnerd88d9832011-09-06 02:00:05 +02002870 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002871 character) required to convert the unicode object. Ignore size argument.
2872
Victor Stinnerd88d9832011-09-06 02:00:05 +02002873 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002874 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002875 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002876static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002877unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002878 wchar_t *w,
2879 Py_ssize_t size)
2880{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002881 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 const wchar_t *wstr;
2883
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002884 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885 if (wstr == NULL)
2886 return -1;
2887
Victor Stinner5593d8a2010-10-02 11:11:27 +00002888 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002889 if (size > res)
2890 size = res + 1;
2891 else
2892 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002894 return res;
2895 }
2896 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002897 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002898}
2899
2900Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002901PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002902 wchar_t *w,
2903 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904{
2905 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 PyErr_BadInternalCall();
2907 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002909 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910}
2911
Victor Stinner137c34c2010-09-29 10:25:54 +00002912wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002913PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002914 Py_ssize_t *size)
2915{
2916 wchar_t* buffer;
2917 Py_ssize_t buflen;
2918
2919 if (unicode == NULL) {
2920 PyErr_BadInternalCall();
2921 return NULL;
2922 }
2923
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002924 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002925 if (buflen == -1)
2926 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002927 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002928 PyErr_NoMemory();
2929 return NULL;
2930 }
2931
Victor Stinner137c34c2010-09-29 10:25:54 +00002932 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2933 if (buffer == NULL) {
2934 PyErr_NoMemory();
2935 return NULL;
2936 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002937 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002938 if (buflen == -1)
2939 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002940 if (size != NULL)
2941 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002942 return buffer;
2943}
2944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002945#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946
Alexander Belopolsky40018472011-02-26 01:02:56 +00002947PyObject *
2948PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002950 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002951 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002952 PyErr_SetString(PyExc_ValueError,
2953 "chr() arg not in range(0x110000)");
2954 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002955 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 if (ordinal < 256)
2958 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 v = PyUnicode_New(1, ordinal);
2961 if (v == NULL)
2962 return NULL;
2963 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002964 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002966}
2967
Alexander Belopolsky40018472011-02-26 01:02:56 +00002968PyObject *
2969PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002971 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002973 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002974 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002975 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002976 Py_INCREF(obj);
2977 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002978 }
2979 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 /* For a Unicode subtype that's not a Unicode object,
2981 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002982 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002983 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002984 PyErr_Format(PyExc_TypeError,
2985 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002986 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002987 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002994{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002995 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002996 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002997
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 PyErr_BadInternalCall();
3000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003002
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003003 /* Decoding bytes objects is the most common case and should be fast */
3004 if (PyBytes_Check(obj)) {
3005 if (PyBytes_GET_SIZE(obj) == 0) {
3006 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003007 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003008 }
3009 else {
3010 v = PyUnicode_Decode(
3011 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3012 encoding, errors);
3013 }
3014 return v;
3015 }
3016
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003017 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 PyErr_SetString(PyExc_TypeError,
3019 "decoding str is not supported");
3020 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003021 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003022
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003023 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3024 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3025 PyErr_Format(PyExc_TypeError,
3026 "coercing to str: need bytes, bytearray "
3027 "or buffer-like object, %.80s found",
3028 Py_TYPE(obj)->tp_name);
3029 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003030 }
Tim Petersced69f82003-09-16 20:30:58 +00003031
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003032 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003034 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 }
Tim Petersced69f82003-09-16 20:30:58 +00003036 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003037 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003038
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003039 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003040 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041}
3042
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result; lower_len is the size of that
   buffer in bytes.

   Return 0 on error (the result, including its trailing null byte, does
   not fit in lower_len bytes), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* An empty buffer cannot even hold the terminator; bail out before
       lower_len - 1 wraps around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot is reserved for the trailing null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3079
Alexander Belopolsky40018472011-02-26 01:02:56 +00003080PyObject *
3081PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003082 Py_ssize_t size,
3083 const char *encoding,
3084 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003085{
3086 PyObject *buffer = NULL, *unicode;
3087 Py_buffer info;
3088 char lower[11]; /* Enough for any encoding shortcut */
3089
Fred Drakee4315f52000-05-09 19:53:39 +00003090 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003091 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003092 if ((strcmp(lower, "utf-8") == 0) ||
3093 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003094 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003095 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003096 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003097 (strcmp(lower, "iso-8859-1") == 0))
3098 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003099#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003100 else if (strcmp(lower, "mbcs") == 0)
3101 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003102#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003103 else if (strcmp(lower, "ascii") == 0)
3104 return PyUnicode_DecodeASCII(s, size, errors);
3105 else if (strcmp(lower, "utf-16") == 0)
3106 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3107 else if (strcmp(lower, "utf-32") == 0)
3108 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110
3111 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003112 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003113 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003114 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003115 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 if (buffer == NULL)
3117 goto onError;
3118 unicode = PyCodec_Decode(buffer, encoding, errors);
3119 if (unicode == NULL)
3120 goto onError;
3121 if (!PyUnicode_Check(unicode)) {
3122 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003123 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003124 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 Py_DECREF(unicode);
3126 goto onError;
3127 }
3128 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003129 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003130
Benjamin Peterson29060642009-01-31 22:14:21 +00003131 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 Py_XDECREF(buffer);
3133 return NULL;
3134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003140{
3141 PyObject *v;
3142
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 goto onError;
3146 }
3147
3148 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003150
3151 /* Decode via the codec registry */
3152 v = PyCodec_Decode(unicode, encoding, errors);
3153 if (v == NULL)
3154 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003155 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156
Benjamin Peterson29060642009-01-31 22:14:21 +00003157 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003158 return NULL;
3159}
3160
Alexander Belopolsky40018472011-02-26 01:02:56 +00003161PyObject *
3162PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003163 const char *encoding,
3164 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003165{
3166 PyObject *v;
3167
3168 if (!PyUnicode_Check(unicode)) {
3169 PyErr_BadArgument();
3170 goto onError;
3171 }
3172
3173 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003175
3176 /* Decode via the codec registry */
3177 v = PyCodec_Decode(unicode, encoding, errors);
3178 if (v == NULL)
3179 goto onError;
3180 if (!PyUnicode_Check(v)) {
3181 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003182 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003183 Py_TYPE(v)->tp_name);
3184 Py_DECREF(v);
3185 goto onError;
3186 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003187 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003188
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190 return NULL;
3191}
3192
Alexander Belopolsky40018472011-02-26 01:02:56 +00003193PyObject *
3194PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003195 Py_ssize_t size,
3196 const char *encoding,
3197 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198{
3199 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003200
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 unicode = PyUnicode_FromUnicode(s, size);
3202 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3205 Py_DECREF(unicode);
3206 return v;
3207}
3208
Alexander Belopolsky40018472011-02-26 01:02:56 +00003209PyObject *
3210PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003211 const char *encoding,
3212 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003213{
3214 PyObject *v;
3215
3216 if (!PyUnicode_Check(unicode)) {
3217 PyErr_BadArgument();
3218 goto onError;
3219 }
3220
3221 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003223
3224 /* Encode via the codec registry */
3225 v = PyCodec_Encode(unicode, encoding, errors);
3226 if (v == NULL)
3227 goto onError;
3228 return v;
3229
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003231 return NULL;
3232}
3233
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (a surrogate
   pair counts as one character when wchar_t is 16 bits wide).  Return the
   index, in wchar_t units, of the first character that fails, or 0 if
   every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

        /* Build a tiny null-terminated string holding one character. */
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            unit[2] = 0;
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        unit[1] = 0;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3280
Victor Stinner1b579672011-12-17 05:47:23 +01003281static int
3282locale_error_handler(const char *errors, int *surrogateescape)
3283{
3284 if (errors == NULL) {
3285 *surrogateescape = 0;
3286 return 0;
3287 }
3288
3289 if (strcmp(errors, "strict") == 0) {
3290 *surrogateescape = 0;
3291 return 0;
3292 }
3293 if (strcmp(errors, "surrogateescape") == 0) {
3294 *surrogateescape = 1;
3295 return 0;
3296 }
3297 PyErr_Format(PyExc_ValueError,
3298 "only 'strict' and 'surrogateescape' error handlers "
3299 "are supported, not '%s'",
3300 errors);
3301 return -1;
3302}
3303
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003305PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003306{
3307 Py_ssize_t wlen, wlen2;
3308 wchar_t *wstr;
3309 PyObject *bytes = NULL;
3310 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003311 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003312 PyObject *exc;
3313 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003314 int surrogateescape;
3315
3316 if (locale_error_handler(errors, &surrogateescape) < 0)
3317 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003318
3319 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3320 if (wstr == NULL)
3321 return NULL;
3322
3323 wlen2 = wcslen(wstr);
3324 if (wlen2 != wlen) {
3325 PyMem_Free(wstr);
3326 PyErr_SetString(PyExc_TypeError, "embedded null character");
3327 return NULL;
3328 }
3329
3330 if (surrogateescape) {
3331 /* locale encoding with surrogateescape */
3332 char *str;
3333
3334 str = _Py_wchar2char(wstr, &error_pos);
3335 if (str == NULL) {
3336 if (error_pos == (size_t)-1) {
3337 PyErr_NoMemory();
3338 PyMem_Free(wstr);
3339 return NULL;
3340 }
3341 else {
3342 goto encode_error;
3343 }
3344 }
3345 PyMem_Free(wstr);
3346
3347 bytes = PyBytes_FromString(str);
3348 PyMem_Free(str);
3349 }
3350 else {
3351 size_t len, len2;
3352
3353 len = wcstombs(NULL, wstr, 0);
3354 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003355 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003356 goto encode_error;
3357 }
3358
3359 bytes = PyBytes_FromStringAndSize(NULL, len);
3360 if (bytes == NULL) {
3361 PyMem_Free(wstr);
3362 return NULL;
3363 }
3364
3365 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3366 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003367 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003368 goto encode_error;
3369 }
3370 PyMem_Free(wstr);
3371 }
3372 return bytes;
3373
3374encode_error:
3375 errmsg = strerror(errno);
3376 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003377
3378 if (error_pos == (size_t)-1)
3379 error_pos = wcstombs_errorpos(wstr);
3380
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003381 PyMem_Free(wstr);
3382 Py_XDECREF(bytes);
3383
Victor Stinner2f197072011-12-17 07:08:30 +01003384 if (errmsg != NULL) {
3385 size_t errlen;
3386 wstr = _Py_char2wchar(errmsg, &errlen);
3387 if (wstr != NULL) {
3388 reason = PyUnicode_FromWideChar(wstr, errlen);
3389 PyMem_Free(wstr);
3390 } else
3391 errmsg = NULL;
3392 }
3393 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003394 reason = PyUnicode_FromString(
3395 "wcstombs() encountered an unencodable "
3396 "wide character");
3397 if (reason == NULL)
3398 return NULL;
3399
3400 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3401 "locale", unicode,
3402 (Py_ssize_t)error_pos,
3403 (Py_ssize_t)(error_pos+1),
3404 reason);
3405 Py_DECREF(reason);
3406 if (exc != NULL) {
3407 PyCodec_StrictErrors(exc);
3408 Py_XDECREF(exc);
3409 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003410 return NULL;
3411}
3412
Victor Stinnerad158722010-10-27 00:25:46 +00003413PyObject *
3414PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003415{
Victor Stinner99b95382011-07-04 14:23:54 +02003416#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003417 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003418#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003419 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003420#else
Victor Stinner793b5312011-04-27 00:24:21 +02003421 PyInterpreterState *interp = PyThreadState_GET()->interp;
3422 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3423 cannot use it to encode and decode filenames before it is loaded. Load
3424 the Python codec requires to encode at least its own filename. Use the C
3425 version of the locale codec until the codec registry is initialized and
3426 the Python codec is loaded.
3427
3428 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3429 cannot only rely on it: check also interp->fscodec_initialized for
3430 subinterpreters. */
3431 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003432 return PyUnicode_AsEncodedString(unicode,
3433 Py_FileSystemDefaultEncoding,
3434 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003435 }
3436 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003437 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003438 }
Victor Stinnerad158722010-10-27 00:25:46 +00003439#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003440}
3441
Alexander Belopolsky40018472011-02-26 01:02:56 +00003442PyObject *
3443PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003444 const char *encoding,
3445 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446{
3447 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003448 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003449
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 if (!PyUnicode_Check(unicode)) {
3451 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 }
Fred Drakee4315f52000-05-09 19:53:39 +00003454
Fred Drakee4315f52000-05-09 19:53:39 +00003455 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003456 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003457 if ((strcmp(lower, "utf-8") == 0) ||
3458 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003459 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003460 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003461 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003462 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003464 }
Victor Stinner37296e82010-06-10 13:36:23 +00003465 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003466 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003467 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003468 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003469#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003470 else if (strcmp(lower, "mbcs") == 0)
3471 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003472#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003473 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476
3477 /* Encode via the codec registry */
3478 v = PyCodec_Encode(unicode, encoding, errors);
3479 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003480 return NULL;
3481
3482 /* The normal path */
3483 if (PyBytes_Check(v))
3484 return v;
3485
3486 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003488 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003489 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003490
3491 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3492 "encoder %s returned bytearray instead of bytes",
3493 encoding);
3494 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003495 Py_DECREF(v);
3496 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003497 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003498
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003499 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3500 Py_DECREF(v);
3501 return b;
3502 }
3503
3504 PyErr_Format(PyExc_TypeError,
3505 "encoder did not return a bytes object (type=%.400s)",
3506 Py_TYPE(v)->tp_name);
3507 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003508 return NULL;
3509}
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511PyObject *
3512PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 const char *encoding,
3514 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003515{
3516 PyObject *v;
3517
3518 if (!PyUnicode_Check(unicode)) {
3519 PyErr_BadArgument();
3520 goto onError;
3521 }
3522
3523 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003524 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003525
3526 /* Encode via the codec registry */
3527 v = PyCodec_Encode(unicode, encoding, errors);
3528 if (v == NULL)
3529 goto onError;
3530 if (!PyUnicode_Check(v)) {
3531 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003532 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003533 Py_TYPE(v)->tp_name);
3534 Py_DECREF(v);
3535 goto onError;
3536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003538
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 return NULL;
3541}
3542
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   decode.

   Return the byte offset of the first invalid or incomplete sequence, or 0
   if none is found.  Without HAVE_MBRTOWC no scan is possible and the
   result is always 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *pos = str;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; len != 0; ) {
        size_t consumed = mbrtowc(&wc, (char*)pos, len, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return pos - str;
        pos += consumed;
        len -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3573
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003576 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003577{
3578 wchar_t smallbuf[256];
3579 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3580 wchar_t *wstr;
3581 size_t wlen, wlen2;
3582 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003583 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003584 size_t error_pos;
3585 char *errmsg;
3586 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003587
3588 if (locale_error_handler(errors, &surrogateescape) < 0)
3589 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003590
3591 if (str[len] != '\0' || len != strlen(str)) {
3592 PyErr_SetString(PyExc_TypeError, "embedded null character");
3593 return NULL;
3594 }
3595
3596 if (surrogateescape)
3597 {
3598 wstr = _Py_char2wchar(str, &wlen);
3599 if (wstr == NULL) {
3600 if (wlen == (size_t)-1)
3601 PyErr_NoMemory();
3602 else
3603 PyErr_SetFromErrno(PyExc_OSError);
3604 return NULL;
3605 }
3606
3607 unicode = PyUnicode_FromWideChar(wstr, wlen);
3608 PyMem_Free(wstr);
3609 }
3610 else {
3611#ifndef HAVE_BROKEN_MBSTOWCS
3612 wlen = mbstowcs(NULL, str, 0);
3613#else
3614 wlen = len;
3615#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003616 if (wlen == (size_t)-1)
3617 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003618 if (wlen+1 <= smallbuf_len) {
3619 wstr = smallbuf;
3620 }
3621 else {
3622 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3623 return PyErr_NoMemory();
3624
3625 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3626 if (!wstr)
3627 return PyErr_NoMemory();
3628 }
3629
3630 /* This shouldn't fail now */
3631 wlen2 = mbstowcs(wstr, str, wlen+1);
3632 if (wlen2 == (size_t)-1) {
3633 if (wstr != smallbuf)
3634 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003635 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003636 }
3637#ifdef HAVE_BROKEN_MBSTOWCS
3638 assert(wlen2 == wlen);
3639#endif
3640 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3641 if (wstr != smallbuf)
3642 PyMem_Free(wstr);
3643 }
3644 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003645
3646decode_error:
3647 errmsg = strerror(errno);
3648 assert(errmsg != NULL);
3649
3650 error_pos = mbstowcs_errorpos(str, len);
3651 if (errmsg != NULL) {
3652 size_t errlen;
3653 wstr = _Py_char2wchar(errmsg, &errlen);
3654 if (wstr != NULL) {
3655 reason = PyUnicode_FromWideChar(wstr, errlen);
3656 PyMem_Free(wstr);
3657 } else
3658 errmsg = NULL;
3659 }
3660 if (errmsg == NULL)
3661 reason = PyUnicode_FromString(
3662 "mbstowcs() encountered an invalid multibyte sequence");
3663 if (reason == NULL)
3664 return NULL;
3665
3666 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3667 "locale", str, len,
3668 (Py_ssize_t)error_pos,
3669 (Py_ssize_t)(error_pos+1),
3670 reason);
3671 Py_DECREF(reason);
3672 if (exc != NULL) {
3673 PyCodec_StrictErrors(exc);
3674 Py_XDECREF(exc);
3675 }
3676 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003677}
3678
3679PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003680PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003681{
3682 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003683 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003684}
3685
3686
3687PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003688PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003690 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3691}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003692
Christian Heimes5894ba72007-11-04 11:43:14 +00003693PyObject*
3694PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3695{
Victor Stinner99b95382011-07-04 14:23:54 +02003696#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003697 return PyUnicode_DecodeMBCS(s, size, NULL);
3698#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003699 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003700#else
Victor Stinner793b5312011-04-27 00:24:21 +02003701 PyInterpreterState *interp = PyThreadState_GET()->interp;
3702 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3703 cannot use it to encode and decode filenames before it is loaded. Load
3704 the Python codec requires to encode at least its own filename. Use the C
3705 version of the locale codec until the codec registry is initialized and
3706 the Python codec is loaded.
3707
3708 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3709 cannot only rely on it: check also interp->fscodec_initialized for
3710 subinterpreters. */
3711 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712 return PyUnicode_Decode(s, size,
3713 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003714 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003715 }
3716 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003717 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718 }
Victor Stinnerad158722010-10-27 00:25:46 +00003719#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720}
3721
Martin v. Löwis011e8422009-05-05 04:43:17 +00003722
3723int
Antoine Pitrou13348842012-01-29 18:36:34 +01003724_PyUnicode_HasNULChars(PyObject* s)
3725{
3726 static PyObject *nul = NULL;
3727
3728 if (nul == NULL)
3729 nul = PyUnicode_FromStringAndSize("\0", 1);
3730 if (nul == NULL)
3731 return -1;
3732 return PyUnicode_Contains(s, nul);
3733}
3734
3735
3736int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003737PyUnicode_FSConverter(PyObject* arg, void* addr)
3738{
3739 PyObject *output = NULL;
3740 Py_ssize_t size;
3741 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003742 if (arg == NULL) {
3743 Py_DECREF(*(PyObject**)addr);
3744 return 1;
3745 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003746 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003747 output = arg;
3748 Py_INCREF(output);
3749 }
3750 else {
3751 arg = PyUnicode_FromObject(arg);
3752 if (!arg)
3753 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003754 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003755 Py_DECREF(arg);
3756 if (!output)
3757 return 0;
3758 if (!PyBytes_Check(output)) {
3759 Py_DECREF(output);
3760 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3761 return 0;
3762 }
3763 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003764 size = PyBytes_GET_SIZE(output);
3765 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003766 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003767 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003768 Py_DECREF(output);
3769 return 0;
3770 }
3771 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003772 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003773}
3774
3775
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003776int
3777PyUnicode_FSDecoder(PyObject* arg, void* addr)
3778{
3779 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003780 if (arg == NULL) {
3781 Py_DECREF(*(PyObject**)addr);
3782 return 1;
3783 }
3784 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003785 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003786 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003787 output = arg;
3788 Py_INCREF(output);
3789 }
3790 else {
3791 arg = PyBytes_FromObject(arg);
3792 if (!arg)
3793 return 0;
3794 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3795 PyBytes_GET_SIZE(arg));
3796 Py_DECREF(arg);
3797 if (!output)
3798 return 0;
3799 if (!PyUnicode_Check(output)) {
3800 Py_DECREF(output);
3801 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3802 return 0;
3803 }
3804 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003805 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003806 Py_DECREF(output);
3807 return 0;
3808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003810 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003811 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3812 Py_DECREF(output);
3813 return 0;
3814 }
3815 *(PyObject**)addr = output;
3816 return Py_CLEANUP_SUPPORTED;
3817}
3818
3819
Martin v. Löwis5b222132007-06-10 09:51:05 +00003820char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003822{
Christian Heimesf3863112007-11-22 07:46:41 +00003823 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003825 if (!PyUnicode_Check(unicode)) {
3826 PyErr_BadArgument();
3827 return NULL;
3828 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003829 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003830 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003832 if (PyUnicode_UTF8(unicode) == NULL) {
3833 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3835 if (bytes == NULL)
3836 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003837 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3838 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 Py_DECREF(bytes);
3840 return NULL;
3841 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3843 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3844 PyBytes_AS_STRING(bytes),
3845 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846 Py_DECREF(bytes);
3847 }
3848
3849 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003850 *psize = PyUnicode_UTF8_LENGTH(unicode);
3851 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003852}
3853
3854char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3858}
3859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860Py_UNICODE *
3861PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 const unsigned char *one_byte;
3864#if SIZEOF_WCHAR_T == 4
3865 const Py_UCS2 *two_bytes;
3866#else
3867 const Py_UCS4 *four_bytes;
3868 const Py_UCS4 *ucs4_end;
3869 Py_ssize_t num_surrogates;
3870#endif
3871 wchar_t *w;
3872 wchar_t *wchar_end;
3873
3874 if (!PyUnicode_Check(unicode)) {
3875 PyErr_BadArgument();
3876 return NULL;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 assert(_PyUnicode_KIND(unicode) != 0);
3881 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003883 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3886 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 num_surrogates = 0;
3888
3889 for (; four_bytes < ucs4_end; ++four_bytes) {
3890 if (*four_bytes > 0xFFFF)
3891 ++num_surrogates;
3892 }
3893
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3895 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3896 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 PyErr_NoMemory();
3898 return NULL;
3899 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003902 w = _PyUnicode_WSTR(unicode);
3903 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3904 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3906 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003907 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003909 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3910 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 }
3912 else
3913 *w = *four_bytes;
3914
3915 if (w > wchar_end) {
3916 assert(0 && "Miscalculated string end");
3917 }
3918 }
3919 *w = 0;
3920#else
3921 /* sizeof(wchar_t) == 4 */
3922 Py_FatalError("Impossible unicode object state, wstr and str "
3923 "should share memory already.");
3924 return NULL;
3925#endif
3926 }
3927 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003928 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3929 (_PyUnicode_LENGTH(unicode) + 1));
3930 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 PyErr_NoMemory();
3932 return NULL;
3933 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3935 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3936 w = _PyUnicode_WSTR(unicode);
3937 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3940 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 for (; w < wchar_end; ++one_byte, ++w)
3942 *w = *one_byte;
3943 /* null-terminate the wstr */
3944 *w = 0;
3945 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003946 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 for (; w < wchar_end; ++two_bytes, ++w)
3950 *w = *two_bytes;
3951 /* null-terminate the wstr */
3952 *w = 0;
3953#else
3954 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 PyObject_FREE(_PyUnicode_WSTR(unicode));
3956 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 Py_FatalError("Impossible unicode object state, wstr "
3958 "and str should share memory already.");
3959 return NULL;
3960#endif
3961 }
3962 else {
3963 assert(0 && "This should never happen.");
3964 }
3965 }
3966 }
3967 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003968 *size = PyUnicode_WSTR_LENGTH(unicode);
3969 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003970}
3971
Alexander Belopolsky40018472011-02-26 01:02:56 +00003972Py_UNICODE *
3973PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976}
3977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978
Alexander Belopolsky40018472011-02-26 01:02:56 +00003979Py_ssize_t
3980PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981{
3982 if (!PyUnicode_Check(unicode)) {
3983 PyErr_BadArgument();
3984 goto onError;
3985 }
3986 return PyUnicode_GET_SIZE(unicode);
3987
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989 return -1;
3990}
3991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992Py_ssize_t
3993PyUnicode_GetLength(PyObject *unicode)
3994{
Victor Stinner07621332012-06-16 04:53:46 +02003995 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 PyErr_BadArgument();
3997 return -1;
3998 }
Victor Stinner07621332012-06-16 04:53:46 +02003999 if (PyUnicode_READY(unicode) == -1)
4000 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 return PyUnicode_GET_LENGTH(unicode);
4002}
4003
4004Py_UCS4
4005PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4006{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004007 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4008 PyErr_BadArgument();
4009 return (Py_UCS4)-1;
4010 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004011 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004012 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 return (Py_UCS4)-1;
4014 }
4015 return PyUnicode_READ_CHAR(unicode, index);
4016}
4017
4018int
4019PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4020{
4021 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004022 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 return -1;
4024 }
Victor Stinner488fa492011-12-12 00:01:39 +01004025 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004026 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004027 PyErr_SetString(PyExc_IndexError, "string index out of range");
4028 return -1;
4029 }
Victor Stinner488fa492011-12-12 00:01:39 +01004030 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004031 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004032 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4033 PyErr_SetString(PyExc_ValueError, "character out of range");
4034 return -1;
4035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4037 index, ch);
4038 return 0;
4039}
4040
/* Return the name of the default encoding, which is always the constant
   string "utf-8".  The storage is static; callers must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4046
Victor Stinner554f3f02010-06-16 23:33:54 +00004047/* create or adjust a UnicodeDecodeError */
4048static void
4049make_decode_exception(PyObject **exceptionObject,
4050 const char *encoding,
4051 const char *input, Py_ssize_t length,
4052 Py_ssize_t startpos, Py_ssize_t endpos,
4053 const char *reason)
4054{
4055 if (*exceptionObject == NULL) {
4056 *exceptionObject = PyUnicodeDecodeError_Create(
4057 encoding, input, length, startpos, endpos, reason);
4058 }
4059 else {
4060 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4061 goto onError;
4062 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4063 goto onError;
4064 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4065 goto onError;
4066 }
4067 return;
4068
4069onError:
4070 Py_DECREF(*exceptionObject);
4071 *exceptionObject = NULL;
4072}
4073
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074/* error handling callback helper:
4075 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004076 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 and adjust various state variables.
4078 return 0 on success, -1 on error
4079*/
4080
Alexander Belopolsky40018472011-02-26 01:02:56 +00004081static int
4082unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004083 const char *encoding, const char *reason,
4084 const char **input, const char **inend, Py_ssize_t *startinpos,
4085 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004086 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004088 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089
4090 PyObject *restuple = NULL;
4091 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004092 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004093 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004094 Py_ssize_t requiredsize;
4095 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004096 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 int res = -1;
4098
Victor Stinner596a6c42011-11-09 00:02:18 +01004099 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4100 outsize = PyUnicode_GET_LENGTH(*output);
4101 else
4102 outsize = _PyUnicode_WSTR_LENGTH(*output);
4103
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 *errorHandler = PyCodec_LookupError(errors);
4106 if (*errorHandler == NULL)
4107 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 }
4109
Victor Stinner554f3f02010-06-16 23:33:54 +00004110 make_decode_exception(exceptionObject,
4111 encoding,
4112 *input, *inend - *input,
4113 *startinpos, *endinpos,
4114 reason);
4115 if (*exceptionObject == NULL)
4116 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117
4118 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4119 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004122 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 }
4125 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004127 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004128 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004129
4130 /* Copy back the bytes variables, which might have been modified by the
4131 callback */
4132 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4133 if (!inputobj)
4134 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004135 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004137 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004138 *input = PyBytes_AS_STRING(inputobj);
4139 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004141 /* we can DECREF safely, as the exception has another reference,
4142 so the object won't go away. */
4143 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004147 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4149 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151
Victor Stinner596a6c42011-11-09 00:02:18 +01004152 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4153 /* need more space? (at least enough for what we
4154 have+the replacement+the rest of the string (starting
4155 at the new input position), so we won't have to check space
4156 when there are no errors in the rest of the string) */
4157 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4158 requiredsize = *outpos + replen + insize-newpos;
4159 if (requiredsize > outsize) {
4160 if (requiredsize<2*outsize)
4161 requiredsize = 2*outsize;
4162 if (unicode_resize(output, requiredsize) < 0)
4163 goto onError;
4164 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004165 if (unicode_widen(output, *outpos,
4166 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004168 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004169 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004171 else {
4172 wchar_t *repwstr;
4173 Py_ssize_t repwlen;
4174 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4175 if (repwstr == NULL)
4176 goto onError;
4177 /* need more space? (at least enough for what we
4178 have+the replacement+the rest of the string (starting
4179 at the new input position), so we won't have to check space
4180 when there are no errors in the rest of the string) */
4181 requiredsize = *outpos + repwlen + insize-newpos;
4182 if (requiredsize > outsize) {
4183 if (requiredsize < 2*outsize)
4184 requiredsize = 2*outsize;
4185 if (unicode_resize(output, requiredsize) < 0)
4186 goto onError;
4187 }
4188 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4189 *outpos += repwlen;
4190 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004192 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 /* we made it! */
4195 res = 0;
4196
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 Py_XDECREF(restuple);
4199 return res;
4200}
4201
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004202/* --- UTF-7 Codec -------------------------------------------------------- */
4203
Antoine Pitrou244651a2009-05-04 18:56:13 +00004204/* See RFC2152 for details. We encode conservatively and decode liberally. */
4205
4206/* Three simple macros defining base-64. */
4207
/* Is c a base-64 character?  True for 'A'-'Z', 'a'-'z', '0'-'9', '+' and
   '/'.  The argument is expanded several times, so it must not have side
   effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4215
/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61 and '+' to 62.
   Any other argument yields 63, so '/' is not tested for explicitly and
   callers must check IS_BASE64() first.  The argument is expanded several
   times, so it must not have side effects. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4223
/* What is the base-64 character of the bottom 6 bits of n?  Higher bits of
   n are masked off, so the table index is always in range. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4228
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The test is a plain "<= 127" comparison, so a negative
 * argument also satisfies it; the argument is expanded twice and must
 * not have side effects. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4236
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * All three arguments are parenthesized in the expansion so that
 * compound expressions such as "a || b" may be passed for the flags;
 * c is expanded several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Alexander Belopolsky40018472011-02-26 01:02:56 +00004283PyObject *
4284PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004285 Py_ssize_t size,
4286 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004287{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004288 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4289}
4290
Antoine Pitrou244651a2009-05-04 18:56:13 +00004291/* The decoder. The only state we preserve is our read position,
4292 * i.e. how many characters we have consumed. So if we end in the
4293 * middle of a shift sequence we have to back off the read position
4294 * and the output to the beginning of the sequence, otherwise we lose
4295 * all the shift state (seen bits, number of bits seen, high
4296 * surrogate). */
4297
Alexander Belopolsky40018472011-02-26 01:02:56 +00004298PyObject *
4299PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004300 Py_ssize_t size,
4301 const char *errors,
4302 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004304 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004305 Py_ssize_t startinpos;
4306 Py_ssize_t endinpos;
4307 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004309 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 const char *errmsg = "";
4311 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 unsigned int base64bits = 0;
4314 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004315 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 PyObject *errorHandler = NULL;
4317 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004319 /* Start off assuming it's all ASCII. Widen later as necessary. */
4320 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 if (!unicode)
4322 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004323 if (size == 0) {
4324 if (consumed)
4325 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004326 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004327 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004329 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 e = s + size;
4331
4332 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004335 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 if (inShift) { /* in a base-64 section */
4338 if (IS_BASE64(ch)) { /* consume a base-64 character */
4339 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4340 base64bits += 6;
4341 s++;
4342 if (base64bits >= 16) {
4343 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004344 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 base64bits -= 16;
4346 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4347 if (surrogate) {
4348 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004349 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4350 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4352 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004354 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 }
4356 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004357 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4358 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 }
4361 }
Victor Stinner551ac952011-11-29 22:58:13 +01004362 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 /* first surrogate */
4364 surrogate = outCh;
4365 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4368 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 }
4370 }
4371 }
4372 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 inShift = 0;
4374 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004376 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4377 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004378 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (base64bits > 0) { /* left-over bits */
4381 if (base64bits >= 6) {
4382 /* We've seen at least one base-64 character */
4383 errmsg = "partial character in shift sequence";
4384 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 else {
4387 /* Some bits remain; they should be zero */
4388 if (base64buffer != 0) {
4389 errmsg = "non-zero padding bits in shift sequence";
4390 goto utf7Error;
4391 }
4392 }
4393 }
4394 if (ch != '-') {
4395 /* '-' is absorbed; other terminating
4396 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4398 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
4401 }
4402 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 s++; /* consume '+' */
4405 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4408 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 }
4410 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004417 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4418 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 s++;
4420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 startinpos = s-starts;
4423 s++;
4424 errmsg = "unexpected special character";
4425 goto utf7Error;
4426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 endinpos = s-starts;
4430 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 errors, &errorHandler,
4432 "utf7", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004434 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 /* end of string */
4439
4440 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4441 /* if we're in an inconsistent state, that's an error */
4442 if (surrogate ||
4443 (base64bits >= 6) ||
4444 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 endinpos = size;
4446 if (unicode_decode_call_errorhandler(
4447 errors, &errorHandler,
4448 "utf7", "unterminated shift sequence",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004450 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 goto onError;
4452 if (s < e)
4453 goto restart;
4454 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456
4457 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004460 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004461 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 }
4463 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004464 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004467
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004469 goto onError;
4470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 Py_XDECREF(errorHandler);
4472 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004473 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478 Py_DECREF(unicode);
4479 return NULL;
4480}
4481
4482
Alexander Belopolsky40018472011-02-26 01:02:56 +00004483PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484_PyUnicode_EncodeUTF7(PyObject *str,
4485 int base64SetO,
4486 int base64WhiteSpace,
4487 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489 int kind;
4490 void *data;
4491 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004492 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004493 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004495 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 unsigned int base64bits = 0;
4497 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 char * out;
4499 char * start;
4500
Benjamin Petersonbac79492012-01-14 13:34:47 -05004501 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004502 return NULL;
4503 kind = PyUnicode_KIND(str);
4504 data = PyUnicode_DATA(str);
4505 len = PyUnicode_GET_LENGTH(str);
4506
4507 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004510 /* It might be possible to tighten this worst case */
4511 allocated = 8 * len;
4512 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004513 return PyErr_NoMemory();
4514
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 if (v == NULL)
4517 return NULL;
4518
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004519 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004520 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004521 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 if (inShift) {
4524 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4525 /* shifting out */
4526 if (base64bits) { /* output remaining bits */
4527 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4528 base64buffer = 0;
4529 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 }
4531 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004532 /* Characters not in the BASE64 set implicitly unshift the sequence
4533 so no '-' is required, except if the character is itself a '-' */
4534 if (IS_BASE64(ch) || ch == '-') {
4535 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 *out++ = (char) ch;
4538 }
4539 else {
4540 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004541 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 else { /* not in a shift sequence */
4544 if (ch == '+') {
4545 *out++ = '+';
4546 *out++ = '-';
4547 }
4548 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4549 *out++ = (char) ch;
4550 }
4551 else {
4552 *out++ = '+';
4553 inShift = 1;
4554 goto encode_char;
4555 }
4556 }
4557 continue;
4558encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004560 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004561
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 /* code first surrogate */
4563 base64bits += 16;
4564 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4565 while (base64bits >= 6) {
4566 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4567 base64bits -= 6;
4568 }
4569 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004570 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 base64bits += 16;
4573 base64buffer = (base64buffer << 16) | ch;
4574 while (base64bits >= 6) {
4575 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4576 base64bits -= 6;
4577 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004578 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 if (base64bits)
4580 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4581 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004583 if (_PyBytes_Resize(&v, out - start) < 0)
4584 return NULL;
4585 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004587PyObject *
4588PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4589 Py_ssize_t size,
4590 int base64SetO,
4591 int base64WhiteSpace,
4592 const char *errors)
4593{
4594 PyObject *result;
4595 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4596 if (tmp == NULL)
4597 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004598 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599 base64WhiteSpace, errors);
4600 Py_DECREF(tmp);
4601 return result;
4602}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604#undef IS_BASE64
4605#undef FROM_BASE64
4606#undef TO_BASE64
4607#undef DECODE_DIRECT
4608#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610/* --- UTF-8 Codec -------------------------------------------------------- */
4611
Alexander Belopolsky40018472011-02-26 01:02:56 +00004612PyObject *
4613PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004614 Py_ssize_t size,
4615 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616{
Walter Dörwald69652032004-09-07 20:24:22 +00004617 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4618}
4619
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004620#include "stringlib/asciilib.h"
4621#include "stringlib/codecs.h"
4622#include "stringlib/undef.h"
4623
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004624#include "stringlib/ucs1lib.h"
4625#include "stringlib/codecs.h"
4626#include "stringlib/undef.h"
4627
4628#include "stringlib/ucs2lib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
4632#include "stringlib/ucs4lib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
Antoine Pitrouab868312009-01-10 15:40:25 +00004636/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4637#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4638
4639/* Mask to quickly check whether a C 'long' contains a
4640 non-ASCII, UTF8-encoded char. */
4641#if (SIZEOF_LONG == 8)
4642# define ASCII_CHAR_MASK 0x8080808080808080L
4643#elif (SIZEOF_LONG == 4)
4644# define ASCII_CHAR_MASK 0x80808080L
4645#else
4646# error C 'long' size should be either 4 or 8!
4647#endif
4648
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004649static Py_ssize_t
4650ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004651{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004652 const char *p = start;
4653 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004654
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004655#if SIZEOF_LONG <= SIZEOF_VOID_P
4656 assert(!((size_t) dest & LONG_PTR_MASK));
4657 if (!((size_t) p & LONG_PTR_MASK)) {
4658 /* Fast path, see in STRINGLIB(utf8_decode) for
4659 an explanation. */
4660 /* Help register allocation */
4661 register const char *_p = p;
4662 register Py_UCS1 * q = dest;
4663 while (_p < aligned_end) {
4664 unsigned long value = *(const unsigned long *) _p;
4665 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 *((unsigned long *)q) = value;
4668 _p += SIZEOF_LONG;
4669 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004670 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 p = _p;
4672 while (p < end) {
4673 if ((unsigned char)*p & 0x80)
4674 break;
4675 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679#endif
4680 while (p < end) {
4681 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4682 for an explanation. */
4683 if (!((size_t) p & LONG_PTR_MASK)) {
4684 /* Help register allocation */
4685 register const char *_p = p;
4686 while (_p < aligned_end) {
4687 unsigned long value = *(unsigned long *) _p;
4688 if (value & ASCII_CHAR_MASK)
4689 break;
4690 _p += SIZEOF_LONG;
4691 }
4692 p = _p;
4693 if (_p == end)
4694 break;
4695 }
4696 if ((unsigned char)*p & 0x80)
4697 break;
4698 ++p;
4699 }
4700 memcpy(dest, start, p - start);
4701 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702}
Antoine Pitrouab868312009-01-10 15:40:25 +00004703
Victor Stinner785938e2011-12-11 20:09:03 +01004704PyObject *
4705PyUnicode_DecodeUTF8Stateful(const char *s,
4706 Py_ssize_t size,
4707 const char *errors,
4708 Py_ssize_t *consumed)
4709{
Victor Stinner785938e2011-12-11 20:09:03 +01004710 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004711 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712 const char *end = s + size;
4713 Py_ssize_t outpos;
4714
4715 Py_ssize_t startinpos;
4716 Py_ssize_t endinpos;
4717 const char *errmsg = "";
4718 PyObject *errorHandler = NULL;
4719 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004720
4721 if (size == 0) {
4722 if (consumed)
4723 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004724 Py_INCREF(unicode_empty);
4725 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004726 }
4727
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4729 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004730 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 *consumed = 1;
4732 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004733 }
4734
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (!unicode)
4737 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004738
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4740 s += outpos;
4741 while (s < end) {
4742 Py_UCS4 ch;
4743 int kind = PyUnicode_KIND(unicode);
4744 if (kind == PyUnicode_1BYTE_KIND) {
4745 if (PyUnicode_IS_ASCII(unicode))
4746 ch = asciilib_utf8_decode(&s, end,
4747 PyUnicode_1BYTE_DATA(unicode), &outpos);
4748 else
4749 ch = ucs1lib_utf8_decode(&s, end,
4750 PyUnicode_1BYTE_DATA(unicode), &outpos);
4751 } else if (kind == PyUnicode_2BYTE_KIND) {
4752 ch = ucs2lib_utf8_decode(&s, end,
4753 PyUnicode_2BYTE_DATA(unicode), &outpos);
4754 } else {
4755 assert(kind == PyUnicode_4BYTE_KIND);
4756 ch = ucs4lib_utf8_decode(&s, end,
4757 PyUnicode_4BYTE_DATA(unicode), &outpos);
4758 }
4759
4760 switch (ch) {
4761 case 0:
4762 if (s == end || consumed)
4763 goto End;
4764 errmsg = "unexpected end of data";
4765 startinpos = s - starts;
4766 endinpos = startinpos + 1;
4767 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4768 endinpos++;
4769 break;
4770 case 1:
4771 errmsg = "invalid start byte";
4772 startinpos = s - starts;
4773 endinpos = startinpos + 1;
4774 break;
4775 case 2:
4776 errmsg = "invalid continuation byte";
4777 startinpos = s - starts;
4778 endinpos = startinpos + 1;
4779 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4780 endinpos++;
4781 break;
4782 default:
4783 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4784 goto onError;
4785 continue;
4786 }
4787
4788 if (unicode_decode_call_errorhandler(
4789 errors, &errorHandler,
4790 "utf-8", errmsg,
4791 &starts, &end, &startinpos, &endinpos, &exc, &s,
4792 &unicode, &outpos))
4793 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004794 }
4795
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796End:
4797 if (unicode_resize(&unicode, outpos) < 0)
4798 goto onError;
4799
4800 if (consumed)
4801 *consumed = s - starts;
4802
4803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
4805 assert(_PyUnicode_CheckConsistency(unicode, 1));
4806 return unicode;
4807
4808onError:
4809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
4811 Py_XDECREF(unicode);
4812 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004813}
4814
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t string.  Every byte that cannot be decoded is
   stored as 0xDC00 + byte.  Returns NULL if the buffer size would
   overflow (after calling PyErr_NoMemory()) or if the allocation
   fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value handed back here is a code point that is about
               to be split into a surrogate pair, so it must lie beyond
               the BMP; it is not itself a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870/* Primary internal function which creates utf8 encoded bytes objects.
4871
4872 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004873 and allocate exactly as much space needed at the end. Else allocate the
4874 maximum possible needed (4 result bytes per Unicode character), and return
4875 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004876*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004877PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004878_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879{
Victor Stinner6099a032011-12-18 14:22:26 +01004880 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004881 void *data;
4882 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884 if (!PyUnicode_Check(unicode)) {
4885 PyErr_BadArgument();
4886 return NULL;
4887 }
4888
4889 if (PyUnicode_READY(unicode) == -1)
4890 return NULL;
4891
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004892 if (PyUnicode_UTF8(unicode))
4893 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4894 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004895
4896 kind = PyUnicode_KIND(unicode);
4897 data = PyUnicode_DATA(unicode);
4898 size = PyUnicode_GET_LENGTH(unicode);
4899
Benjamin Petersonead6b532011-12-20 17:23:42 -06004900 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004901 default:
4902 assert(0);
4903 case PyUnicode_1BYTE_KIND:
4904 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4905 assert(!PyUnicode_IS_ASCII(unicode));
4906 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_2BYTE_KIND:
4908 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4909 case PyUnicode_4BYTE_KIND:
4910 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912}
4913
Alexander Belopolsky40018472011-02-26 01:02:56 +00004914PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4916 Py_ssize_t size,
4917 const char *errors)
4918{
4919 PyObject *v, *unicode;
4920
4921 unicode = PyUnicode_FromUnicode(s, size);
4922 if (unicode == NULL)
4923 return NULL;
4924 v = _PyUnicode_AsUTF8String(unicode, errors);
4925 Py_DECREF(unicode);
4926 return v;
4927}
4928
4929PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004930PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935/* --- UTF-32 Codec ------------------------------------------------------- */
4936
4937PyObject *
4938PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 Py_ssize_t size,
4940 const char *errors,
4941 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942{
4943 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4944}
4945
4946PyObject *
4947PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 Py_ssize_t size,
4949 const char *errors,
4950 int *byteorder,
4951 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952{
4953 const char *starts = s;
4954 Py_ssize_t startinpos;
4955 Py_ssize_t endinpos;
4956 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004957 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004958 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 int bo = 0; /* assume native ordering by default */
4960 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961 /* Offsets from q for retrieving bytes in the right order. */
4962#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4963 int iorder[] = {0, 1, 2, 3};
4964#else
4965 int iorder[] = {3, 2, 1, 0};
4966#endif
4967 PyObject *errorHandler = NULL;
4968 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004969
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970 q = (unsigned char *)s;
4971 e = q + size;
4972
4973 if (byteorder)
4974 bo = *byteorder;
4975
4976 /* Check for BOM marks (U+FEFF) in the input and adjust current
4977 byte order setting accordingly. In native mode, the leading BOM
4978 mark is skipped, in all other modes, it is copied to the output
4979 stream as-is (giving a ZWNBSP character). */
4980 if (bo == 0) {
4981 if (size >= 4) {
4982 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 if (bom == 0x0000FEFF) {
4986 q += 4;
4987 bo = -1;
4988 }
4989 else if (bom == 0xFFFE0000) {
4990 q += 4;
4991 bo = 1;
4992 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 if (bom == 0x0000FEFF) {
4995 q += 4;
4996 bo = 1;
4997 }
4998 else if (bom == 0xFFFE0000) {
4999 q += 4;
5000 bo = -1;
5001 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 }
5005
5006 if (bo == -1) {
5007 /* force LE */
5008 iorder[0] = 0;
5009 iorder[1] = 1;
5010 iorder[2] = 2;
5011 iorder[3] = 3;
5012 }
5013 else if (bo == 1) {
5014 /* force BE */
5015 iorder[0] = 3;
5016 iorder[1] = 2;
5017 iorder[2] = 1;
5018 iorder[3] = 0;
5019 }
5020
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005021 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005022 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005023 if (!unicode)
5024 return NULL;
5025 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005026 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005027 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005028
Walter Dörwald41980ca2007-08-16 21:55:45 +00005029 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 Py_UCS4 ch;
5031 /* remaining bytes at the end? (size should be divisible by 4) */
5032 if (e-q<4) {
5033 if (consumed)
5034 break;
5035 errmsg = "truncated data";
5036 startinpos = ((const char *)q)-starts;
5037 endinpos = ((const char *)e)-starts;
5038 goto utf32Error;
5039 /* The remaining input chars are ignored if the callback
5040 chooses to skip the input */
5041 }
5042 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5043 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 if (ch >= 0x110000)
5046 {
5047 errmsg = "codepoint not in range(0x110000)";
5048 startinpos = ((const char *)q)-starts;
5049 endinpos = startinpos+4;
5050 goto utf32Error;
5051 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005052 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5053 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 q += 4;
5055 continue;
5056 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 if (unicode_decode_call_errorhandler(
5058 errors, &errorHandler,
5059 "utf32", errmsg,
5060 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005061 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063 }
5064
5065 if (byteorder)
5066 *byteorder = bo;
5067
5068 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070
5071 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005072 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 goto onError;
5074
5075 Py_XDECREF(errorHandler);
5076 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005077 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 Py_DECREF(unicode);
5081 Py_XDECREF(errorHandler);
5082 Py_XDECREF(exc);
5083 return NULL;
5084}
5085
5086PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005087_PyUnicode_EncodeUTF32(PyObject *str,
5088 const char *errors,
5089 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005091 int kind;
5092 void *data;
5093 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005094 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097 /* Offsets from p for storing byte pairs in the right order. */
5098#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5099 int iorder[] = {0, 1, 2, 3};
5100#else
5101 int iorder[] = {3, 2, 1, 0};
5102#endif
5103
Benjamin Peterson29060642009-01-31 22:14:21 +00005104#define STORECHAR(CH) \
5105 do { \
5106 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5107 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5108 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5109 p[iorder[0]] = (CH) & 0xff; \
5110 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 } while(0)
5112
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005113 if (!PyUnicode_Check(str)) {
5114 PyErr_BadArgument();
5115 return NULL;
5116 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005117 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005118 return NULL;
5119 kind = PyUnicode_KIND(str);
5120 data = PyUnicode_DATA(str);
5121 len = PyUnicode_GET_LENGTH(str);
5122
5123 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005124 bytesize = nsize * 4;
5125 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005127 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128 if (v == NULL)
5129 return NULL;
5130
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005131 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005135 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136
5137 if (byteorder == -1) {
5138 /* force LE */
5139 iorder[0] = 0;
5140 iorder[1] = 1;
5141 iorder[2] = 2;
5142 iorder[3] = 3;
5143 }
5144 else if (byteorder == 1) {
5145 /* force BE */
5146 iorder[0] = 3;
5147 iorder[1] = 2;
5148 iorder[2] = 1;
5149 iorder[3] = 0;
5150 }
5151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005152 for (i = 0; i < len; i++)
5153 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005154
5155 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005156 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157#undef STORECHAR
5158}
5159
Alexander Belopolsky40018472011-02-26 01:02:56 +00005160PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005161PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5162 Py_ssize_t size,
5163 const char *errors,
5164 int byteorder)
5165{
5166 PyObject *result;
5167 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5168 if (tmp == NULL)
5169 return NULL;
5170 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5171 Py_DECREF(tmp);
5172 return result;
5173}
5174
5175PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005176PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005177{
Victor Stinnerb960b342011-11-20 19:12:52 +01005178 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179}
5180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181/* --- UTF-16 Codec ------------------------------------------------------- */
5182
Tim Peters772747b2001-08-09 22:21:55 +00005183PyObject *
5184PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 Py_ssize_t size,
5186 const char *errors,
5187 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188{
Walter Dörwald69652032004-09-07 20:24:22 +00005189 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5190}
5191
5192PyObject *
5193PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 Py_ssize_t size,
5195 const char *errors,
5196 int *byteorder,
5197 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200 Py_ssize_t startinpos;
5201 Py_ssize_t endinpos;
5202 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005203 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005204 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005205 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005207 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 PyObject *errorHandler = NULL;
5209 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
Tim Peters772747b2001-08-09 22:21:55 +00005211 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213
5214 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005215 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005217 /* Check for BOM marks (U+FEFF) in the input and adjust current
5218 byte order setting accordingly. In native mode, the leading BOM
5219 mark is skipped, in all other modes, it is copied to the output
5220 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005221 if (bo == 0 && size >= 2) {
5222 const Py_UCS4 bom = (q[1] << 8) | q[0];
5223 if (bom == 0xFEFF) {
5224 q += 2;
5225 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 else if (bom == 0xFFFE) {
5228 q += 2;
5229 bo = 1;
5230 }
5231 if (byteorder)
5232 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
Antoine Pitrou63065d72012-05-15 23:48:04 +02005235 if (q == e) {
5236 if (consumed)
5237 *consumed = size;
5238 Py_INCREF(unicode_empty);
5239 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005240 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241
Antoine Pitrouab868312009-01-10 15:40:25 +00005242#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005243 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005244#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005245 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005246#endif
Tim Peters772747b2001-08-09 22:21:55 +00005247
Antoine Pitrou63065d72012-05-15 23:48:04 +02005248 /* Note: size will always be longer than the resulting Unicode
5249 character count */
5250 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5251 if (!unicode)
5252 return NULL;
5253
5254 outpos = 0;
5255 while (1) {
5256 Py_UCS4 ch = 0;
5257 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005258 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005259 if (kind == PyUnicode_1BYTE_KIND) {
5260 if (PyUnicode_IS_ASCII(unicode))
5261 ch = asciilib_utf16_decode(&q, e,
5262 PyUnicode_1BYTE_DATA(unicode), &outpos,
5263 native_ordering);
5264 else
5265 ch = ucs1lib_utf16_decode(&q, e,
5266 PyUnicode_1BYTE_DATA(unicode), &outpos,
5267 native_ordering);
5268 } else if (kind == PyUnicode_2BYTE_KIND) {
5269 ch = ucs2lib_utf16_decode(&q, e,
5270 PyUnicode_2BYTE_DATA(unicode), &outpos,
5271 native_ordering);
5272 } else {
5273 assert(kind == PyUnicode_4BYTE_KIND);
5274 ch = ucs4lib_utf16_decode(&q, e,
5275 PyUnicode_4BYTE_DATA(unicode), &outpos,
5276 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005277 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005279
Antoine Pitrou63065d72012-05-15 23:48:04 +02005280 switch (ch)
5281 {
5282 case 0:
5283 /* remaining byte at the end? (size should be even) */
5284 if (q == e || consumed)
5285 goto End;
5286 errmsg = "truncated data";
5287 startinpos = ((const char *)q) - starts;
5288 endinpos = ((const char *)e) - starts;
5289 break;
5290 /* The remaining input chars are ignored if the callback
5291 chooses to skip the input */
5292 case 1:
5293 errmsg = "unexpected end of data";
5294 startinpos = ((const char *)q) - 2 - starts;
5295 endinpos = ((const char *)e) - starts;
5296 break;
5297 case 2:
5298 errmsg = "illegal encoding";
5299 startinpos = ((const char *)q) - 2 - starts;
5300 endinpos = startinpos + 2;
5301 break;
5302 case 3:
5303 errmsg = "illegal UTF-16 surrogate";
5304 startinpos = ((const char *)q) - 4 - starts;
5305 endinpos = startinpos + 2;
5306 break;
5307 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005308 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5309 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 continue;
5311 }
5312
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005314 errors,
5315 &errorHandler,
5316 "utf16", errmsg,
5317 &starts,
5318 (const char **)&e,
5319 &startinpos,
5320 &endinpos,
5321 &exc,
5322 (const char **)&q,
5323 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005324 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 }
5327
Antoine Pitrou63065d72012-05-15 23:48:04 +02005328End:
Walter Dörwald69652032004-09-07 20:24:22 +00005329 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005333 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 goto onError;
5335
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 Py_XDECREF(errorHandler);
5337 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005338 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 Py_XDECREF(errorHandler);
5343 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 return NULL;
5345}
5346
Tim Peters772747b2001-08-09 22:21:55 +00005347PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005348_PyUnicode_EncodeUTF16(PyObject *str,
5349 const char *errors,
5350 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005352 enum PyUnicode_Kind kind;
5353 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005354 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005355 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005356 unsigned short *out;
5357 Py_ssize_t bytesize;
5358 Py_ssize_t pairs;
5359#ifdef WORDS_BIGENDIAN
5360 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005361#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005362 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005363#endif
5364
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005365 if (!PyUnicode_Check(str)) {
5366 PyErr_BadArgument();
5367 return NULL;
5368 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005369 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370 return NULL;
5371 kind = PyUnicode_KIND(str);
5372 data = PyUnicode_DATA(str);
5373 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005374
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 if (kind == PyUnicode_4BYTE_KIND) {
5377 const Py_UCS4 *in = (const Py_UCS4 *)data;
5378 const Py_UCS4 *end = in + len;
5379 while (in < end)
5380 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005381 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005382 }
5383 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005385 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005386 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 if (v == NULL)
5388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005390 /* output buffer is 2-bytes aligned */
5391 assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0);
5392 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005394 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005395 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005396 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005397
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 switch (kind) {
5399 case PyUnicode_1BYTE_KIND: {
5400 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5401 break;
Tim Peters772747b2001-08-09 22:21:55 +00005402 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005403 case PyUnicode_2BYTE_KIND: {
5404 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5405 break;
Tim Peters772747b2001-08-09 22:21:55 +00005406 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005407 case PyUnicode_4BYTE_KIND: {
5408 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5409 break;
5410 }
5411 default:
5412 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005413 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005414
5415 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005416 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417}
5418
Alexander Belopolsky40018472011-02-26 01:02:56 +00005419PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005420PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5421 Py_ssize_t size,
5422 const char *errors,
5423 int byteorder)
5424{
5425 PyObject *result;
5426 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5427 if (tmp == NULL)
5428 return NULL;
5429 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5430 Py_DECREF(tmp);
5431 return result;
5432}
5433
5434PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005435PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005437 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438}
5439
5440/* --- Unicode Escape Codec ----------------------------------------------- */
5441
/* Helper function for PyUnicode_DecodeUnicodeEscape.

   Scans the `size` bytes at `s` and decides whether the decoded result is
   guaranteed to be pure ASCII.  If so, returns the exact number of
   characters the decoded string will have.  Returns -1 when
     - size is negative,
     - a byte > 127 is present,
     - a backslash is the last byte, or
     - an escape is found that may produce a non-ASCII character
       (octal, \x, \u, \U, \N).

   Counting rules: an ordinary byte counts 1; backslash-newline counts 0;
   the simple escapes (\\ \' \" \b \f \t \n \r \v \a) count 1; a backslash
   followed by any other ASCII byte is kept verbatim and counts 2.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before computing s + size: forming a pointer
       in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5496
Fredrik Lundh06d12682001-01-24 07:59:11 +00005497static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005498
Alexander Belopolsky40018472011-02-26 01:02:56 +00005499PyObject *
5500PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005501 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005502 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005505 Py_ssize_t startinpos;
5506 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005507 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005508 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005510 char* message;
5511 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 PyObject *errorHandler = NULL;
5513 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005514 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005515 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005516
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005517 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518
5519 /* After length_of_escaped_ascii_string() there are two alternatives,
5520 either the string is pure ASCII with named escapes like \n, etc.
5521 and we determined it's exact size (common case)
5522 or it contains \x, \u, ... escape sequences. then we create a
5523 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 if (len >= 0) {
5525 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 if (!v)
5527 goto onError;
5528 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 }
5530 else {
5531 /* Escaped strings will always be longer than the resulting
5532 Unicode string, so we start with size here and then reduce the
5533 length after conversion to the true value.
5534 (but if the error callback returns a long replacement string
5535 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005536 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005537 if (!v)
5538 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005539 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 }
5541
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005543 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 while (s < end) {
5548 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005549 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552 /* The only case in which i == ascii_length is a backslash
5553 followed by a newline. */
5554 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 /* Non-escape characters are interpreted as Unicode ordinals */
5557 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005558 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5559 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 continue;
5561 }
5562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 /* \ - Escapes */
5565 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005566 c = *s++;
5567 if (s > end)
5568 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005570 /* The only case in which i == ascii_length is a backslash
5571 followed by a newline. */
5572 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005573
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005574 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577#define WRITECHAR(ch) \
5578 do { \
5579 if (unicode_putchar(&v, &i, ch) < 0) \
5580 goto onError; \
5581 }while(0)
5582
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584 case '\\': WRITECHAR('\\'); break;
5585 case '\'': WRITECHAR('\''); break;
5586 case '\"': WRITECHAR('\"'); break;
5587 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005588 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005589 case 'f': WRITECHAR('\014'); break;
5590 case 't': WRITECHAR('\t'); break;
5591 case 'n': WRITECHAR('\n'); break;
5592 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005594 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005596 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 case '0': case '1': case '2': case '3':
5600 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005601 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005602 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005603 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005604 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005605 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005607 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 break;
5609
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 /* hex escapes */
5611 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005613 digits = 2;
5614 message = "truncated \\xXX escape";
5615 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005619 digits = 4;
5620 message = "truncated \\uXXXX escape";
5621 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005624 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005625 digits = 8;
5626 message = "truncated \\UXXXXXXXX escape";
5627 hexescape:
5628 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 if (s+digits>end) {
5630 endinpos = size;
5631 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 errors, &errorHandler,
5633 "unicodeescape", "end of string in escape sequence",
5634 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005635 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 goto onError;
5637 goto nextByte;
5638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 for (j = 0; j < digits; ++j) {
5640 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005641 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 errors, &errorHandler,
5645 "unicodeescape", message,
5646 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005647 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005648 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005651 }
5652 chr = (chr<<4) & ~0xF;
5653 if (c >= '0' && c <= '9')
5654 chr += c - '0';
5655 else if (c >= 'a' && c <= 'f')
5656 chr += 10 + c - 'a';
5657 else
5658 chr += 10 + c - 'A';
5659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005661 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 /* _decoding_error will have already written into the
5663 target buffer. */
5664 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005665 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005666 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005667 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005669 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005670 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 errors, &errorHandler,
5673 "unicodeescape", "illegal Unicode character",
5674 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005676 goto onError;
5677 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005678 break;
5679
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005681 case 'N':
5682 message = "malformed \\N character escape";
5683 if (ucnhash_CAPI == NULL) {
5684 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5686 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687 if (ucnhash_CAPI == NULL)
5688 goto ucnhashError;
5689 }
5690 if (*s == '{') {
5691 const char *start = s+1;
5692 /* look for the closing brace */
5693 while (*s != '}' && s < end)
5694 s++;
5695 if (s > start && s < end && *s == '}') {
5696 /* found a name. look it up in the unicode database */
5697 message = "unknown Unicode character name";
5698 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005700 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005701 goto store;
5702 }
5703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 errors, &errorHandler,
5707 "unicodeescape", message,
5708 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005709 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005710 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005711 break;
5712
5713 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005714 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 message = "\\ at end of string";
5716 s--;
5717 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 errors, &errorHandler,
5720 "unicodeescape", message,
5721 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005722 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005723 goto onError;
5724 }
5725 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005726 WRITECHAR('\\');
5727 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005728 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005729 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005734#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735
Victor Stinner16e6a802011-12-12 13:24:15 +01005736 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005737 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005738 Py_XDECREF(errorHandler);
5739 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005740 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005741
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005743 PyErr_SetString(
5744 PyExc_UnicodeError,
5745 "\\N escapes not supported (can't load unicodedata module)"
5746 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005747 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 Py_XDECREF(errorHandler);
5749 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005750 return NULL;
5751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 Py_XDECREF(errorHandler);
5755 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
5757}
5758
5759/* Return a Unicode-Escape string version of the Unicode object.
5760
5761 If quotes is true, the string is enclosed in u"" or u'' quotes as
5762 appropriate.
5763
5764*/
5765
Alexander Belopolsky40018472011-02-26 01:02:56 +00005766PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005767PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005769 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005770 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772 int kind;
5773 void *data;
5774 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
Thomas Wouters89f507f2006-12-13 04:49:30 +00005776 /* Initial allocation is based on the longest-possible unichr
5777 escape.
5778
5779 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5780 unichr, so in this case it's the longest unichr escape. In
5781 narrow (UTF-16) builds this is five chars per source unichr
5782 since there are two unichrs in the surrogate pair, so in narrow
5783 (UTF-16) builds it's not the longest unichr escape.
5784
5785 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5786 so in the narrow (UTF-16) build case it's the longest unichr
5787 escape.
5788 */
5789
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790 if (!PyUnicode_Check(unicode)) {
5791 PyErr_BadArgument();
5792 return NULL;
5793 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005794 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795 return NULL;
5796 len = PyUnicode_GET_LENGTH(unicode);
5797 kind = PyUnicode_KIND(unicode);
5798 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005799 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005800 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5801 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5802 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5803 }
5804
5805 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005806 return PyBytes_FromStringAndSize(NULL, 0);
5807
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005808 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005810
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005811 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005813 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 if (repr == NULL)
5816 return NULL;
5817
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005818 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005820 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005821 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005822
Walter Dörwald79e913e2007-05-12 11:08:06 +00005823 /* Escape backslashes */
5824 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 *p++ = '\\';
5826 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005827 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005828 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005829
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005830 /* Map 21-bit characters to '\U00xxxxxx' */
5831 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005832 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005833 *p++ = '\\';
5834 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005835 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5836 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5837 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5838 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5839 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5840 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5841 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5842 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005844 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005845
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005847 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 *p++ = '\\';
5849 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005850 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5851 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5852 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5853 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005855
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005856 /* Map special whitespace to '\t', \n', '\r' */
5857 else if (ch == '\t') {
5858 *p++ = '\\';
5859 *p++ = 't';
5860 }
5861 else if (ch == '\n') {
5862 *p++ = '\\';
5863 *p++ = 'n';
5864 }
5865 else if (ch == '\r') {
5866 *p++ = '\\';
5867 *p++ = 'r';
5868 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005869
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005870 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005871 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005873 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005874 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5875 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005876 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005877
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 /* Copy everything else as-is */
5879 else
5880 *p++ = (char) ch;
5881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005883 assert(p - PyBytes_AS_STRING(repr) > 0);
5884 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5885 return NULL;
5886 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887}
5888
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5891 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 PyObject *result;
5894 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5895 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 result = PyUnicode_AsUnicodeEscapeString(tmp);
5898 Py_DECREF(tmp);
5899 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900}
5901
5902/* --- Raw Unicode Escape Codec ------------------------------------------- */
5903
Alexander Belopolsky40018472011-02-26 01:02:56 +00005904PyObject *
5905PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005906 Py_ssize_t size,
5907 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005910 Py_ssize_t startinpos;
5911 Py_ssize_t endinpos;
5912 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005913 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 const char *end;
5915 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 PyObject *errorHandler = NULL;
5917 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005918
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 /* Escaped strings will always be longer than the resulting
5920 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921 length after conversion to the true value. (But decoding error
5922 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005927 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005928 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 end = s + size;
5930 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 unsigned char c;
5932 Py_UCS4 x;
5933 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005934 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 /* Non-escape characters are interpreted as Unicode ordinals */
5937 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005938 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5939 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005941 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 startinpos = s-starts;
5943
5944 /* \u-escapes are only interpreted iff the number of leading
5945 backslashes if odd */
5946 bs = s;
5947 for (;s < end;) {
5948 if (*s != '\\')
5949 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005950 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5951 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 }
5953 if (((s - bs) & 1) == 0 ||
5954 s >= end ||
5955 (*s != 'u' && *s != 'U')) {
5956 continue;
5957 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005958 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 count = *s=='u' ? 4 : 8;
5960 s++;
5961
5962 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 for (x = 0, i = 0; i < count; ++i, ++s) {
5964 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005965 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 endinpos = s-starts;
5967 if (unicode_decode_call_errorhandler(
5968 errors, &errorHandler,
5969 "rawunicodeescape", "truncated \\uXXXX",
5970 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005971 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 goto onError;
5973 goto nextByte;
5974 }
5975 x = (x<<4) & ~0xF;
5976 if (c >= '0' && c <= '9')
5977 x += c - '0';
5978 else if (c >= 'a' && c <= 'f')
5979 x += 10 + c - 'a';
5980 else
5981 x += 10 + c - 'A';
5982 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005983 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005984 if (unicode_putchar(&v, &outpos, x) < 0)
5985 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005986 } else {
5987 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005988 if (unicode_decode_call_errorhandler(
5989 errors, &errorHandler,
5990 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005992 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 nextByte:
5996 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005998 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 Py_XDECREF(errorHandler);
6001 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006002 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006003
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006 Py_XDECREF(errorHandler);
6007 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 return NULL;
6009}
6010
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011
Alexander Belopolsky40018472011-02-26 01:02:56 +00006012PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006015 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 char *p;
6017 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006018 Py_ssize_t expandsize, pos;
6019 int kind;
6020 void *data;
6021 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 if (!PyUnicode_Check(unicode)) {
6024 PyErr_BadArgument();
6025 return NULL;
6026 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006027 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028 return NULL;
6029 kind = PyUnicode_KIND(unicode);
6030 data = PyUnicode_DATA(unicode);
6031 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006032 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6033 bytes, and 1 byte characters 4. */
6034 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006035
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006036 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006038
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006039 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 if (repr == NULL)
6041 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006043 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006045 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006046 for (pos = 0; pos < len; pos++) {
6047 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* Map 32-bit characters to '\Uxxxxxxxx' */
6049 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006050 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006051 *p++ = '\\';
6052 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006053 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6054 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6055 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6056 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6057 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6058 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6059 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6060 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006061 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 *p++ = '\\';
6065 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006066 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6067 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6068 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6069 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 /* Copy everything else as-is */
6072 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 *p++ = (char) ch;
6074 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006075
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006076 assert(p > q);
6077 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006078 return NULL;
6079 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080}
6081
Alexander Belopolsky40018472011-02-26 01:02:56 +00006082PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006083PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6084 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006086 PyObject *result;
6087 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6088 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006089 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006090 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6091 Py_DECREF(tmp);
6092 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093}
6094
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006095/* --- Unicode Internal Codec ------------------------------------------- */
6096
Alexander Belopolsky40018472011-02-26 01:02:56 +00006097PyObject *
6098_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006099 Py_ssize_t size,
6100 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006101{
6102 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t startinpos;
6104 Py_ssize_t endinpos;
6105 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006106 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006107 const char *end;
6108 const char *reason;
6109 PyObject *errorHandler = NULL;
6110 PyObject *exc = NULL;
6111
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006112 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006113 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006114 1))
6115 return NULL;
6116
Thomas Wouters89f507f2006-12-13 04:49:30 +00006117 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006118 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006119 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006121 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006122 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006123 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006124 end = s + size;
6125
6126 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006127 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006128 Py_UCS4 ch;
6129 /* We copy the raw representation one byte at a time because the
6130 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006131 ((char *) &uch)[0] = s[0];
6132 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006133#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006134 ((char *) &uch)[2] = s[2];
6135 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006136#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006137 ch = uch;
6138
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006139 /* We have to sanity check the raw data, otherwise doom looms for
6140 some malformed UCS-4 data. */
6141 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006142#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006143 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006144#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006145 end-s < Py_UNICODE_SIZE
6146 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006148 startinpos = s - starts;
6149 if (end-s < Py_UNICODE_SIZE) {
6150 endinpos = end-starts;
6151 reason = "truncated input";
6152 }
6153 else {
6154 endinpos = s - starts + Py_UNICODE_SIZE;
6155 reason = "illegal code point (> 0x10FFFF)";
6156 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006157 if (unicode_decode_call_errorhandler(
6158 errors, &errorHandler,
6159 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006160 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006161 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006162 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006163 continue;
6164 }
6165
6166 s += Py_UNICODE_SIZE;
6167#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006168 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006169 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006170 Py_UNICODE uch2;
6171 ((char *) &uch2)[0] = s[0];
6172 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006173 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006174 {
Victor Stinner551ac952011-11-29 22:58:13 +01006175 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006176 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006177 }
6178 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006179#endif
6180
6181 if (unicode_putchar(&v, &outpos, ch) < 0)
6182 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006183 }
6184
Victor Stinner16e6a802011-12-12 13:24:15 +01006185 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006186 goto onError;
6187 Py_XDECREF(errorHandler);
6188 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006189 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006190
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006192 Py_XDECREF(v);
6193 Py_XDECREF(errorHandler);
6194 Py_XDECREF(exc);
6195 return NULL;
6196}
6197
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198/* --- Latin-1 Codec ------------------------------------------------------ */
6199
Alexander Belopolsky40018472011-02-26 01:02:56 +00006200PyObject *
6201PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006202 Py_ssize_t size,
6203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006206 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006210static void
6211make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006212 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006213 PyObject *unicode,
6214 Py_ssize_t startpos, Py_ssize_t endpos,
6215 const char *reason)
6216{
6217 if (*exceptionObject == NULL) {
6218 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006219 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006220 encoding, unicode, startpos, endpos, reason);
6221 }
6222 else {
6223 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6224 goto onError;
6225 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6226 goto onError;
6227 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6228 goto onError;
6229 return;
6230 onError:
6231 Py_DECREF(*exceptionObject);
6232 *exceptionObject = NULL;
6233 }
6234}
6235
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006237static void
6238raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006239 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006240 PyObject *unicode,
6241 Py_ssize_t startpos, Py_ssize_t endpos,
6242 const char *reason)
6243{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006244 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006245 encoding, unicode, startpos, endpos, reason);
6246 if (*exceptionObject != NULL)
6247 PyCodec_StrictErrors(*exceptionObject);
6248}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249
6250/* error handling callback helper:
6251 build arguments, call the callback and check the arguments,
6252 put the result into newpos and return the replacement string, which
6253 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254static PyObject *
6255unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006256 PyObject **errorHandler,
6257 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006258 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006259 Py_ssize_t startpos, Py_ssize_t endpos,
6260 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006262 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006263 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 PyObject *restuple;
6265 PyObject *resunicode;
6266
6267 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006269 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 }
6272
Benjamin Petersonbac79492012-01-14 13:34:47 -05006273 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 return NULL;
6275 len = PyUnicode_GET_LENGTH(unicode);
6276
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006277 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006278 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281
6282 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006284 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006286 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006287 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 Py_DECREF(restuple);
6289 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006291 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 &resunicode, newpos)) {
6293 Py_DECREF(restuple);
6294 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006296 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6297 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6298 Py_DECREF(restuple);
6299 return NULL;
6300 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006302 *newpos = len + *newpos;
6303 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6305 Py_DECREF(restuple);
6306 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006307 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006308 Py_INCREF(resunicode);
6309 Py_DECREF(restuple);
6310 return resunicode;
6311}
6312
Alexander Belopolsky40018472011-02-26 01:02:56 +00006313static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006314unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006315 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006316 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006317{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006318 /* input state */
6319 Py_ssize_t pos=0, size;
6320 int kind;
6321 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322 /* output object */
6323 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324 /* pointer into the output */
6325 char *str;
6326 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006327 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006328 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6329 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330 PyObject *errorHandler = NULL;
6331 PyObject *exc = NULL;
6332 /* the following variable is used for caching string comparisons
6333 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6334 int known_errorHandler = -1;
6335
Benjamin Petersonbac79492012-01-14 13:34:47 -05006336 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006337 return NULL;
6338 size = PyUnicode_GET_LENGTH(unicode);
6339 kind = PyUnicode_KIND(unicode);
6340 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341 /* allocate enough for a simple encoding without
6342 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006343 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006344 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006345 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006347 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349 ressize = size;
6350
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 while (pos < size) {
6352 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 /* can we encode this? */
6355 if (c<limit) {
6356 /* no overflow check, because we know that the space is enough */
6357 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 Py_ssize_t requiredsize;
6362 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006363 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006365 Py_ssize_t collstart = pos;
6366 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 ++collend;
6370 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6371 if (known_errorHandler==-1) {
6372 if ((errors==NULL) || (!strcmp(errors, "strict")))
6373 known_errorHandler = 1;
6374 else if (!strcmp(errors, "replace"))
6375 known_errorHandler = 2;
6376 else if (!strcmp(errors, "ignore"))
6377 known_errorHandler = 3;
6378 else if (!strcmp(errors, "xmlcharrefreplace"))
6379 known_errorHandler = 4;
6380 else
6381 known_errorHandler = 0;
6382 }
6383 switch (known_errorHandler) {
6384 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006385 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 goto onError;
6387 case 2: /* replace */
6388 while (collstart++<collend)
6389 *str++ = '?'; /* fall through */
6390 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006391 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 break;
6393 case 4: /* xmlcharrefreplace */
6394 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006395 /* determine replacement size */
6396 for (i = collstart, repsize = 0; i < collend; ++i) {
6397 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6398 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006400 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006402 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006404 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006410 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006411 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006413 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 if (requiredsize > ressize) {
6417 if (requiredsize<2*ressize)
6418 requiredsize = 2*ressize;
6419 if (_PyBytes_Resize(&res, requiredsize))
6420 goto onError;
6421 str = PyBytes_AS_STRING(res) + respos;
6422 ressize = requiredsize;
6423 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006424 /* generate replacement */
6425 for (i = collstart; i < collend; ++i) {
6426 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006428 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 break;
6430 default:
6431 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 encoding, reason, unicode, &exc,
6433 collstart, collend, &newpos);
6434 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006435 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006437 if (PyBytes_Check(repunicode)) {
6438 /* Directly copy bytes result to output. */
6439 repsize = PyBytes_Size(repunicode);
6440 if (repsize > 1) {
6441 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006442 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006443 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6444 Py_DECREF(repunicode);
6445 goto onError;
6446 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006447 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006448 ressize += repsize-1;
6449 }
6450 memcpy(str, PyBytes_AsString(repunicode), repsize);
6451 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006453 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006454 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006455 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 /* need more space? (at least enough for what we
6457 have+the replacement+the rest of the string, so
6458 we won't have to check space for encodable characters) */
6459 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006460 repsize = PyUnicode_GET_LENGTH(repunicode);
6461 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 if (requiredsize > ressize) {
6463 if (requiredsize<2*ressize)
6464 requiredsize = 2*ressize;
6465 if (_PyBytes_Resize(&res, requiredsize)) {
6466 Py_DECREF(repunicode);
6467 goto onError;
6468 }
6469 str = PyBytes_AS_STRING(res) + respos;
6470 ressize = requiredsize;
6471 }
6472 /* check if there is anything unencodable in the replacement
6473 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 for (i = 0; repsize-->0; ++i, ++str) {
6475 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006477 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006478 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 Py_DECREF(repunicode);
6480 goto onError;
6481 }
6482 *str = (char)c;
6483 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006485 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006486 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006487 }
6488 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006489 /* Resize if we allocated to much */
6490 size = str - PyBytes_AS_STRING(res);
6491 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006492 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006493 if (_PyBytes_Resize(&res, size) < 0)
6494 goto onError;
6495 }
6496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006497 Py_XDECREF(errorHandler);
6498 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006499 return res;
6500
6501 onError:
6502 Py_XDECREF(res);
6503 Py_XDECREF(errorHandler);
6504 Py_XDECREF(exc);
6505 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506}
6507
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006509PyObject *
6510PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006511 Py_ssize_t size,
6512 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 PyObject *result;
6515 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6516 if (unicode == NULL)
6517 return NULL;
6518 result = unicode_encode_ucs1(unicode, errors, 256);
6519 Py_DECREF(unicode);
6520 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521}
6522
Alexander Belopolsky40018472011-02-26 01:02:56 +00006523PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006524_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525{
6526 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 PyErr_BadArgument();
6528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006530 if (PyUnicode_READY(unicode) == -1)
6531 return NULL;
6532 /* Fast path: if it is a one-byte string, construct
6533 bytes object directly. */
6534 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6535 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6536 PyUnicode_GET_LENGTH(unicode));
6537 /* Non-Latin-1 characters present. Defer to above function to
6538 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006540}
6541
6542PyObject*
6543PyUnicode_AsLatin1String(PyObject *unicode)
6544{
6545 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546}
6547
6548/* --- 7-bit ASCII Codec -------------------------------------------------- */
6549
Alexander Belopolsky40018472011-02-26 01:02:56 +00006550PyObject *
6551PyUnicode_DecodeASCII(const char *s,
6552 Py_ssize_t size,
6553 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006556 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006557 int kind;
6558 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006559 Py_ssize_t startinpos;
6560 Py_ssize_t endinpos;
6561 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 const char *e;
6563 PyObject *errorHandler = NULL;
6564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006565
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006566 if (size == 0) {
6567 Py_INCREF(unicode_empty);
6568 return unicode_empty;
6569 }
6570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006572 if (size == 1 && (unsigned char)s[0] < 128)
6573 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006574
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006575 unicode = PyUnicode_New(size, 127);
6576 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006578
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006580 data = PyUnicode_1BYTE_DATA(unicode);
6581 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6582 if (outpos == size)
6583 return unicode;
6584
6585 s += outpos;
6586 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006587 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 register unsigned char c = (unsigned char)*s;
6589 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006590 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 ++s;
6592 }
6593 else {
6594 startinpos = s-starts;
6595 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 if (unicode_decode_call_errorhandler(
6597 errors, &errorHandler,
6598 "ascii", "ordinal not in range(128)",
6599 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006600 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006602 kind = PyUnicode_KIND(unicode);
6603 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006606 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006607 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608 Py_XDECREF(errorHandler);
6609 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006610 assert(_PyUnicode_CheckConsistency(unicode, 1));
6611 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006612
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006614 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615 Py_XDECREF(errorHandler);
6616 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 return NULL;
6618}
6619
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621PyObject *
6622PyUnicode_EncodeASCII(const Py_UNICODE *p,
6623 Py_ssize_t size,
6624 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006626 PyObject *result;
6627 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6628 if (unicode == NULL)
6629 return NULL;
6630 result = unicode_encode_ucs1(unicode, errors, 128);
6631 Py_DECREF(unicode);
6632 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633}
6634
Alexander Belopolsky40018472011-02-26 01:02:56 +00006635PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006636_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637{
6638 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 PyErr_BadArgument();
6640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006642 if (PyUnicode_READY(unicode) == -1)
6643 return NULL;
6644 /* Fast path: if it is an ASCII-only string, construct bytes object
6645 directly. Else defer to above function to raise the exception. */
6646 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6647 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6648 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006650}
6651
6652PyObject *
6653PyUnicode_AsASCIIString(PyObject *unicode)
6654{
6655 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Victor Stinner99b95382011-07-04 14:23:54 +02006658#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006659
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006660/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006661
/* When size_t is wider than int, an input can be longer than INT_MAX;
   NEED_RETRY enables the code that processes such input in several
   int-sized chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6669
6670static char*
6671code_page_name(UINT code_page, PyObject **obj)
6672{
6673 *obj = NULL;
6674 if (code_page == CP_ACP)
6675 return "mbcs";
6676 if (code_page == CP_UTF7)
6677 return "CP_UTF7";
6678 if (code_page == CP_UTF8)
6679 return "CP_UTF8";
6680
6681 *obj = PyBytes_FromFormat("cp%u", code_page);
6682 if (*obj == NULL)
6683 return NULL;
6684 return PyBytes_AS_STRING(*obj);
6685}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686
Alexander Belopolsky40018472011-02-26 01:02:56 +00006687static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006688is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006689{
6690 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006691 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692
Victor Stinner3a50e702011-10-18 21:21:00 +02006693 if (!IsDBCSLeadByteEx(code_page, *curr))
6694 return 0;
6695
6696 prev = CharPrevExA(code_page, s, curr, 0);
6697 if (prev == curr)
6698 return 1;
6699 /* FIXME: This code is limited to "true" double-byte encodings,
6700 as it assumes an incomplete character consists of a single
6701 byte. */
6702 if (curr - prev == 2)
6703 return 1;
6704 if (!IsDBCSLeadByteEx(code_page, *prev))
6705 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706 return 0;
6707}
6708
Victor Stinner3a50e702011-10-18 21:21:00 +02006709static DWORD
6710decode_code_page_flags(UINT code_page)
6711{
6712 if (code_page == CP_UTF7) {
6713 /* The CP_UTF7 decoder only supports flags=0 */
6714 return 0;
6715 }
6716 else
6717 return MB_ERR_INVALID_CHARS;
6718}
6719
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006721 * Decode a byte string from a Windows code page into unicode object in strict
6722 * mode.
6723 *
6724 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6725 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006728decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006729 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006730 const char *in,
6731 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732{
Victor Stinner3a50e702011-10-18 21:21:00 +02006733 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006734 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006735 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006736
6737 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006738 assert(insize > 0);
6739 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6740 if (outsize <= 0)
6741 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006742
6743 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006745 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006746 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 if (*v == NULL)
6748 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006749 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750 }
6751 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006753 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006754 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006756 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006757 }
6758
6759 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006760 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6761 if (outsize <= 0)
6762 goto error;
6763 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006764
Victor Stinner3a50e702011-10-18 21:21:00 +02006765error:
6766 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6767 return -2;
6768 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006769 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770}
6771
Victor Stinner3a50e702011-10-18 21:21:00 +02006772/*
6773 * Decode a byte string from a code page into unicode object with an error
6774 * handler.
6775 *
6776 * Returns consumed size if succeed, or raise a WindowsError or
6777 * UnicodeDecodeError exception and returns -1 on error.
6778 */
6779static int
6780decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 PyObject **v,
6782 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006783 const char *errors)
6784{
6785 const char *startin = in;
6786 const char *endin = in + size;
6787 const DWORD flags = decode_code_page_flags(code_page);
6788 /* Ideally, we should get reason from FormatMessage. This is the Windows
6789 2000 English version of the message. */
6790 const char *reason = "No mapping for the Unicode character exists "
6791 "in the target code page.";
6792 /* each step cannot decode more than 1 character, but a character can be
6793 represented as a surrogate pair */
6794 wchar_t buffer[2], *startout, *out;
6795 int insize, outsize;
6796 PyObject *errorHandler = NULL;
6797 PyObject *exc = NULL;
6798 PyObject *encoding_obj = NULL;
6799 char *encoding;
6800 DWORD err;
6801 int ret = -1;
6802
6803 assert(size > 0);
6804
6805 encoding = code_page_name(code_page, &encoding_obj);
6806 if (encoding == NULL)
6807 return -1;
6808
6809 if (errors == NULL || strcmp(errors, "strict") == 0) {
6810 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6811 UnicodeDecodeError. */
6812 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6813 if (exc != NULL) {
6814 PyCodec_StrictErrors(exc);
6815 Py_CLEAR(exc);
6816 }
6817 goto error;
6818 }
6819
6820 if (*v == NULL) {
6821 /* Create unicode object */
6822 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6823 PyErr_NoMemory();
6824 goto error;
6825 }
Victor Stinnerab595942011-12-17 04:59:06 +01006826 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006827 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 if (*v == NULL)
6829 goto error;
6830 startout = PyUnicode_AS_UNICODE(*v);
6831 }
6832 else {
6833 /* Extend unicode object */
6834 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6835 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6836 PyErr_NoMemory();
6837 goto error;
6838 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006839 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006840 goto error;
6841 startout = PyUnicode_AS_UNICODE(*v) + n;
6842 }
6843
6844 /* Decode the byte string character per character */
6845 out = startout;
6846 while (in < endin)
6847 {
6848 /* Decode a character */
6849 insize = 1;
6850 do
6851 {
6852 outsize = MultiByteToWideChar(code_page, flags,
6853 in, insize,
6854 buffer, Py_ARRAY_LENGTH(buffer));
6855 if (outsize > 0)
6856 break;
6857 err = GetLastError();
6858 if (err != ERROR_NO_UNICODE_TRANSLATION
6859 && err != ERROR_INSUFFICIENT_BUFFER)
6860 {
6861 PyErr_SetFromWindowsErr(0);
6862 goto error;
6863 }
6864 insize++;
6865 }
6866 /* 4=maximum length of a UTF-8 sequence */
6867 while (insize <= 4 && (in + insize) <= endin);
6868
6869 if (outsize <= 0) {
6870 Py_ssize_t startinpos, endinpos, outpos;
6871
6872 startinpos = in - startin;
6873 endinpos = startinpos + 1;
6874 outpos = out - PyUnicode_AS_UNICODE(*v);
6875 if (unicode_decode_call_errorhandler(
6876 errors, &errorHandler,
6877 encoding, reason,
6878 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006879 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006880 {
6881 goto error;
6882 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006883 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006884 }
6885 else {
6886 in += insize;
6887 memcpy(out, buffer, outsize * sizeof(wchar_t));
6888 out += outsize;
6889 }
6890 }
6891
6892 /* write a NUL character at the end */
6893 *out = 0;
6894
6895 /* Extend unicode object */
6896 outsize = out - startout;
6897 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006898 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006901
6902error:
6903 Py_XDECREF(encoding_obj);
6904 Py_XDECREF(errorHandler);
6905 Py_XDECREF(exc);
6906 return ret;
6907}
6908
Victor Stinner3a50e702011-10-18 21:21:00 +02006909static PyObject *
6910decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006911 const char *s, Py_ssize_t size,
6912 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913{
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 PyObject *v = NULL;
6915 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006916
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 if (code_page < 0) {
6918 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6919 return NULL;
6920 }
6921
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006922 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924
Victor Stinner76a31a62011-11-04 00:05:13 +01006925 do
6926 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006928 if (size > INT_MAX) {
6929 chunk_size = INT_MAX;
6930 final = 0;
6931 done = 0;
6932 }
6933 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006935 {
6936 chunk_size = (int)size;
6937 final = (consumed == NULL);
6938 done = 1;
6939 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006940
Victor Stinner76a31a62011-11-04 00:05:13 +01006941 /* Skip trailing lead-byte unless 'final' is set */
6942 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6943 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944
Victor Stinner76a31a62011-11-04 00:05:13 +01006945 if (chunk_size == 0 && done) {
6946 if (v != NULL)
6947 break;
6948 Py_INCREF(unicode_empty);
6949 return unicode_empty;
6950 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951
Victor Stinner76a31a62011-11-04 00:05:13 +01006952
6953 converted = decode_code_page_strict(code_page, &v,
6954 s, chunk_size);
6955 if (converted == -2)
6956 converted = decode_code_page_errors(code_page, &v,
6957 s, chunk_size,
6958 errors);
6959 assert(converted != 0);
6960
6961 if (converted < 0) {
6962 Py_XDECREF(v);
6963 return NULL;
6964 }
6965
6966 if (consumed)
6967 *consumed += converted;
6968
6969 s += converted;
6970 size -= converted;
6971 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006972
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006973 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974}
6975
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006977PyUnicode_DecodeCodePageStateful(int code_page,
6978 const char *s,
6979 Py_ssize_t size,
6980 const char *errors,
6981 Py_ssize_t *consumed)
6982{
6983 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6984}
6985
6986PyObject *
6987PyUnicode_DecodeMBCSStateful(const char *s,
6988 Py_ssize_t size,
6989 const char *errors,
6990 Py_ssize_t *consumed)
6991{
6992 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6993}
6994
6995PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006996PyUnicode_DecodeMBCS(const char *s,
6997 Py_ssize_t size,
6998 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006999{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7001}
7002
Victor Stinner3a50e702011-10-18 21:21:00 +02007003static DWORD
7004encode_code_page_flags(UINT code_page, const char *errors)
7005{
7006 if (code_page == CP_UTF8) {
7007 if (winver.dwMajorVersion >= 6)
7008 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7009 and later */
7010 return WC_ERR_INVALID_CHARS;
7011 else
7012 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7013 return 0;
7014 }
7015 else if (code_page == CP_UTF7) {
7016 /* CP_UTF7 only supports flags=0 */
7017 return 0;
7018 }
7019 else {
7020 if (errors != NULL && strcmp(errors, "replace") == 0)
7021 return 0;
7022 else
7023 return WC_NO_BEST_FIT_CHARS;
7024 }
7025}
7026
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007027/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 * Encode a Unicode string to a Windows code page into a byte string in strict
7029 * mode.
7030 *
7031 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7032 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007034static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007035encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007036 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007037 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038{
Victor Stinner554f3f02010-06-16 23:33:54 +00007039 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 BOOL *pusedDefaultChar = &usedDefaultChar;
7041 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007042 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007043 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007044 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 const DWORD flags = encode_code_page_flags(code_page, NULL);
7046 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007047 /* Create a substring so that we can get the UTF-16 representation
7048 of just the slice under consideration. */
7049 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050
Martin v. Löwis3d325192011-11-04 18:23:06 +01007051 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007054 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007055 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007056 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007057
Victor Stinner2fc507f2011-11-04 20:06:39 +01007058 substring = PyUnicode_Substring(unicode, offset, offset+len);
7059 if (substring == NULL)
7060 return -1;
7061 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7062 if (p == NULL) {
7063 Py_DECREF(substring);
7064 return -1;
7065 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007066
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007067 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 outsize = WideCharToMultiByte(code_page, flags,
7069 p, size,
7070 NULL, 0,
7071 NULL, pusedDefaultChar);
7072 if (outsize <= 0)
7073 goto error;
7074 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007075 if (pusedDefaultChar && *pusedDefaultChar) {
7076 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007078 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007079
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007083 if (*outbytes == NULL) {
7084 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007086 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 }
7089 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 const Py_ssize_t n = PyBytes_Size(*outbytes);
7092 if (outsize > PY_SSIZE_T_MAX - n) {
7093 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007094 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007097 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7098 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007099 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007100 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007101 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 }
7103
7104 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 outsize = WideCharToMultiByte(code_page, flags,
7106 p, size,
7107 out, outsize,
7108 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007109 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 if (outsize <= 0)
7111 goto error;
7112 if (pusedDefaultChar && *pusedDefaultChar)
7113 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007115
Victor Stinner3a50e702011-10-18 21:21:00 +02007116error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007117 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007118 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7119 return -2;
7120 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007121 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007122}
7123
Victor Stinner3a50e702011-10-18 21:21:00 +02007124/*
7125 * Encode a Unicode string to a Windows code page into a byte string using a
7126 * error handler.
7127 *
7128 * Returns consumed characters if succeed, or raise a WindowsError and returns
7129 * -1 on other error.
7130 */
7131static int
7132encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007133 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007134 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007135{
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007137 Py_ssize_t pos = unicode_offset;
7138 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 /* Ideally, we should get reason from FormatMessage. This is the Windows
7140 2000 English version of the message. */
7141 const char *reason = "invalid character";
7142 /* 4=maximum length of a UTF-8 sequence */
7143 char buffer[4];
7144 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7145 Py_ssize_t outsize;
7146 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 PyObject *errorHandler = NULL;
7148 PyObject *exc = NULL;
7149 PyObject *encoding_obj = NULL;
7150 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007151 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 PyObject *rep;
7153 int ret = -1;
7154
7155 assert(insize > 0);
7156
7157 encoding = code_page_name(code_page, &encoding_obj);
7158 if (encoding == NULL)
7159 return -1;
7160
7161 if (errors == NULL || strcmp(errors, "strict") == 0) {
7162 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7163 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007164 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 if (exc != NULL) {
7166 PyCodec_StrictErrors(exc);
7167 Py_DECREF(exc);
7168 }
7169 Py_XDECREF(encoding_obj);
7170 return -1;
7171 }
7172
7173 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7174 pusedDefaultChar = &usedDefaultChar;
7175 else
7176 pusedDefaultChar = NULL;
7177
7178 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7179 PyErr_NoMemory();
7180 goto error;
7181 }
7182 outsize = insize * Py_ARRAY_LENGTH(buffer);
7183
7184 if (*outbytes == NULL) {
7185 /* Create string object */
7186 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7187 if (*outbytes == NULL)
7188 goto error;
7189 out = PyBytes_AS_STRING(*outbytes);
7190 }
7191 else {
7192 /* Extend string object */
7193 Py_ssize_t n = PyBytes_Size(*outbytes);
7194 if (n > PY_SSIZE_T_MAX - outsize) {
7195 PyErr_NoMemory();
7196 goto error;
7197 }
7198 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7199 goto error;
7200 out = PyBytes_AS_STRING(*outbytes) + n;
7201 }
7202
7203 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007204 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7207 wchar_t chars[2];
7208 int charsize;
7209 if (ch < 0x10000) {
7210 chars[0] = (wchar_t)ch;
7211 charsize = 1;
7212 }
7213 else {
7214 ch -= 0x10000;
7215 chars[0] = 0xd800 + (ch >> 10);
7216 chars[1] = 0xdc00 + (ch & 0x3ff);
7217 charsize = 2;
7218 }
7219
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007221 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 buffer, Py_ARRAY_LENGTH(buffer),
7223 NULL, pusedDefaultChar);
7224 if (outsize > 0) {
7225 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7226 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007227 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 memcpy(out, buffer, outsize);
7229 out += outsize;
7230 continue;
7231 }
7232 }
7233 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7234 PyErr_SetFromWindowsErr(0);
7235 goto error;
7236 }
7237
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 rep = unicode_encode_call_errorhandler(
7239 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007240 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007241 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 if (rep == NULL)
7243 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007244 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007245
7246 if (PyBytes_Check(rep)) {
7247 outsize = PyBytes_GET_SIZE(rep);
7248 if (outsize != 1) {
7249 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7250 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7251 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7252 Py_DECREF(rep);
7253 goto error;
7254 }
7255 out = PyBytes_AS_STRING(*outbytes) + offset;
7256 }
7257 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7258 out += outsize;
7259 }
7260 else {
7261 Py_ssize_t i;
7262 enum PyUnicode_Kind kind;
7263 void *data;
7264
Benjamin Petersonbac79492012-01-14 13:34:47 -05007265 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 Py_DECREF(rep);
7267 goto error;
7268 }
7269
7270 outsize = PyUnicode_GET_LENGTH(rep);
7271 if (outsize != 1) {
7272 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7273 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7274 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7275 Py_DECREF(rep);
7276 goto error;
7277 }
7278 out = PyBytes_AS_STRING(*outbytes) + offset;
7279 }
7280 kind = PyUnicode_KIND(rep);
7281 data = PyUnicode_DATA(rep);
7282 for (i=0; i < outsize; i++) {
7283 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7284 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007285 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007286 encoding, unicode,
7287 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 "unable to encode error handler result to ASCII");
7289 Py_DECREF(rep);
7290 goto error;
7291 }
7292 *out = (unsigned char)ch;
7293 out++;
7294 }
7295 }
7296 Py_DECREF(rep);
7297 }
7298 /* write a NUL byte */
7299 *out = 0;
7300 outsize = out - PyBytes_AS_STRING(*outbytes);
7301 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7302 if (_PyBytes_Resize(outbytes, outsize) < 0)
7303 goto error;
7304 ret = 0;
7305
7306error:
7307 Py_XDECREF(encoding_obj);
7308 Py_XDECREF(errorHandler);
7309 Py_XDECREF(exc);
7310 return ret;
7311}
7312
Victor Stinner3a50e702011-10-18 21:21:00 +02007313static PyObject *
7314encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007315 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 const char *errors)
7317{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007318 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007320 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007322
Benjamin Petersonbac79492012-01-14 13:34:47 -05007323 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007324 return NULL;
7325 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007326
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 if (code_page < 0) {
7328 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7329 return NULL;
7330 }
7331
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 return PyBytes_FromStringAndSize(NULL, 0);
7334
Victor Stinner7581cef2011-11-03 22:32:33 +01007335 offset = 0;
7336 do
7337 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007339 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007340 chunks. */
7341 if (len > INT_MAX/2) {
7342 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 done = 0;
7344 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007345 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007348 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007349 done = 1;
7350 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007351
Victor Stinner76a31a62011-11-04 00:05:13 +01007352 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007353 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007354 errors);
7355 if (ret == -2)
7356 ret = encode_code_page_errors(code_page, &outbytes,
7357 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007358 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007359 if (ret < 0) {
7360 Py_XDECREF(outbytes);
7361 return NULL;
7362 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363
Victor Stinner7581cef2011-11-03 22:32:33 +01007364 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007365 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007366 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007367
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 return outbytes;
7369}
7370
7371PyObject *
7372PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7373 Py_ssize_t size,
7374 const char *errors)
7375{
Victor Stinner7581cef2011-11-03 22:32:33 +01007376 PyObject *unicode, *res;
7377 unicode = PyUnicode_FromUnicode(p, size);
7378 if (unicode == NULL)
7379 return NULL;
7380 res = encode_code_page(CP_ACP, unicode, errors);
7381 Py_DECREF(unicode);
7382 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383}
7384
7385PyObject *
7386PyUnicode_EncodeCodePage(int code_page,
7387 PyObject *unicode,
7388 const char *errors)
7389{
Victor Stinner7581cef2011-11-03 22:32:33 +01007390 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007391}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007392
Alexander Belopolsky40018472011-02-26 01:02:56 +00007393PyObject *
7394PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007395{
7396 if (!PyUnicode_Check(unicode)) {
7397 PyErr_BadArgument();
7398 return NULL;
7399 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007400 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007401}
7402
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007403#undef NEED_RETRY
7404
Victor Stinner99b95382011-07-04 14:23:54 +02007405#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007406
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407/* --- Character Mapping Codec -------------------------------------------- */
7408
Alexander Belopolsky40018472011-02-26 01:02:56 +00007409PyObject *
7410PyUnicode_DecodeCharmap(const char *s,
7411 Py_ssize_t size,
7412 PyObject *mapping,
7413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007416 Py_ssize_t startinpos;
7417 Py_ssize_t endinpos;
7418 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007420 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007421 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007422 PyObject *errorHandler = NULL;
7423 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007424
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425 /* Default to Latin-1 */
7426 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007427 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007429 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007433 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007434 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007435 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007436 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007437 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007438 enum PyUnicode_Kind mapkind;
7439 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007440 Py_UCS4 x;
7441
Benjamin Petersonbac79492012-01-14 13:34:47 -05007442 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007443 return NULL;
7444
7445 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007446 mapdata = PyUnicode_DATA(mapping);
7447 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007449 unsigned char ch;
7450 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7451 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7452 if (outkind == PyUnicode_1BYTE_KIND) {
7453 void *outdata = PyUnicode_DATA(v);
7454 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7455 while (s < e) {
7456 unsigned char ch = *s;
7457 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7458 if (x > maxchar)
7459 goto Error;
7460 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7461 ++s;
7462 }
7463 break;
7464 }
7465 else if (outkind == PyUnicode_2BYTE_KIND) {
7466 void *outdata = PyUnicode_DATA(v);
7467 while (s < e) {
7468 unsigned char ch = *s;
7469 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7470 if (x == 0xFFFE)
7471 goto Error;
7472 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7473 ++s;
7474 }
7475 break;
7476 }
7477 }
7478 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007481 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007482 else
7483 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007484Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007485 if (x == 0xfffe)
7486 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 startinpos = s-starts;
7489 endinpos = startinpos+1;
7490 if (unicode_decode_call_errorhandler(
7491 errors, &errorHandler,
7492 "charmap", "character maps to <undefined>",
7493 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007494 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 goto onError;
7496 }
7497 continue;
7498 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007499
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007500 if (unicode_putchar(&v, &outpos, x) < 0)
7501 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007503 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007504 }
7505 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 while (s < e) {
7507 unsigned char ch = *s;
7508 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007509
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7511 w = PyLong_FromLong((long)ch);
7512 if (w == NULL)
7513 goto onError;
7514 x = PyObject_GetItem(mapping, w);
7515 Py_DECREF(w);
7516 if (x == NULL) {
7517 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7518 /* No mapping found means: mapping is undefined. */
7519 PyErr_Clear();
7520 x = Py_None;
7521 Py_INCREF(x);
7522 } else
7523 goto onError;
7524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007525
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 /* Apply mapping */
7527 if (PyLong_Check(x)) {
7528 long value = PyLong_AS_LONG(x);
7529 if (value < 0 || value > 65535) {
7530 PyErr_SetString(PyExc_TypeError,
7531 "character mapping must be in range(65536)");
7532 Py_DECREF(x);
7533 goto onError;
7534 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007535 if (unicode_putchar(&v, &outpos, value) < 0)
7536 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 }
7538 else if (x == Py_None) {
7539 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 startinpos = s-starts;
7541 endinpos = startinpos+1;
7542 if (unicode_decode_call_errorhandler(
7543 errors, &errorHandler,
7544 "charmap", "character maps to <undefined>",
7545 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007546 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 Py_DECREF(x);
7548 goto onError;
7549 }
7550 Py_DECREF(x);
7551 continue;
7552 }
7553 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007554 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007555
Benjamin Petersonbac79492012-01-14 13:34:47 -05007556 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007557 goto onError;
7558 targetsize = PyUnicode_GET_LENGTH(x);
7559
7560 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007562 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007563 PyUnicode_READ_CHAR(x, 0)) < 0)
7564 goto onError;
7565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 else if (targetsize > 1) {
7567 /* 1-n mapping */
7568 if (targetsize > extrachars) {
7569 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_ssize_t needed = (targetsize - extrachars) + \
7571 (targetsize << 2);
7572 extrachars += needed;
7573 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007574 if (unicode_resize(&v,
7575 PyUnicode_GET_LENGTH(v) + needed) < 0)
7576 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 Py_DECREF(x);
7578 goto onError;
7579 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007581 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007582 goto onError;
7583 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7584 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 extrachars -= targetsize;
7586 }
7587 /* 1-0 mapping: skip the character */
7588 }
7589 else {
7590 /* wrong return value */
7591 PyErr_SetString(PyExc_TypeError,
7592 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007593 Py_DECREF(x);
7594 goto onError;
7595 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 Py_DECREF(x);
7597 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007600 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007601 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007602 Py_XDECREF(errorHandler);
7603 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007604 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007605
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007607 Py_XDECREF(errorHandler);
7608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 Py_XDECREF(v);
7610 return NULL;
7611}
7612
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007613/* Charmap encoding: the lookup table */
7614
Alexander Belopolsky40018472011-02-26 01:02:56 +00007615struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 PyObject_HEAD
7617 unsigned char level1[32];
7618 int count2, count3;
7619 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007620};
7621
7622static PyObject*
7623encoding_map_size(PyObject *obj, PyObject* args)
7624{
7625 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007628}
7629
7630static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 PyDoc_STR("Return the size (in bytes) of this object") },
7633 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007634};
7635
7636static void
7637encoding_map_dealloc(PyObject* o)
7638{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007639 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007640}
7641
7642static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007643 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 "EncodingMap", /*tp_name*/
7645 sizeof(struct encoding_map), /*tp_basicsize*/
7646 0, /*tp_itemsize*/
7647 /* methods */
7648 encoding_map_dealloc, /*tp_dealloc*/
7649 0, /*tp_print*/
7650 0, /*tp_getattr*/
7651 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007652 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 0, /*tp_repr*/
7654 0, /*tp_as_number*/
7655 0, /*tp_as_sequence*/
7656 0, /*tp_as_mapping*/
7657 0, /*tp_hash*/
7658 0, /*tp_call*/
7659 0, /*tp_str*/
7660 0, /*tp_getattro*/
7661 0, /*tp_setattro*/
7662 0, /*tp_as_buffer*/
7663 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7664 0, /*tp_doc*/
7665 0, /*tp_traverse*/
7666 0, /*tp_clear*/
7667 0, /*tp_richcompare*/
7668 0, /*tp_weaklistoffset*/
7669 0, /*tp_iter*/
7670 0, /*tp_iternext*/
7671 encoding_map_methods, /*tp_methods*/
7672 0, /*tp_members*/
7673 0, /*tp_getset*/
7674 0, /*tp_base*/
7675 0, /*tp_dict*/
7676 0, /*tp_descr_get*/
7677 0, /*tp_descr_set*/
7678 0, /*tp_dictoffset*/
7679 0, /*tp_init*/
7680 0, /*tp_alloc*/
7681 0, /*tp_new*/
7682 0, /*tp_free*/
7683 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007684};
7685
7686PyObject*
7687PyUnicode_BuildEncodingMap(PyObject* string)
7688{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007689 PyObject *result;
7690 struct encoding_map *mresult;
7691 int i;
7692 int need_dict = 0;
7693 unsigned char level1[32];
7694 unsigned char level2[512];
7695 unsigned char *mlevel1, *mlevel2, *mlevel3;
7696 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007697 int kind;
7698 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007699 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007700 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007701
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007702 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007703 PyErr_BadArgument();
7704 return NULL;
7705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007706 kind = PyUnicode_KIND(string);
7707 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007708 length = PyUnicode_GET_LENGTH(string);
7709 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007710 memset(level1, 0xFF, sizeof level1);
7711 memset(level2, 0xFF, sizeof level2);
7712
7713 /* If there isn't a one-to-one mapping of NULL to \0,
7714 or if there are non-BMP characters, we need to use
7715 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007716 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007718 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007720 ch = PyUnicode_READ(kind, data, i);
7721 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007722 need_dict = 1;
7723 break;
7724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007725 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007726 /* unmapped character */
7727 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 l1 = ch >> 11;
7729 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007730 if (level1[l1] == 0xFF)
7731 level1[l1] = count2++;
7732 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007734 }
7735
7736 if (count2 >= 0xFF || count3 >= 0xFF)
7737 need_dict = 1;
7738
7739 if (need_dict) {
7740 PyObject *result = PyDict_New();
7741 PyObject *key, *value;
7742 if (!result)
7743 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007744 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007745 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007746 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747 if (!key || !value)
7748 goto failed1;
7749 if (PyDict_SetItem(result, key, value) == -1)
7750 goto failed1;
7751 Py_DECREF(key);
7752 Py_DECREF(value);
7753 }
7754 return result;
7755 failed1:
7756 Py_XDECREF(key);
7757 Py_XDECREF(value);
7758 Py_DECREF(result);
7759 return NULL;
7760 }
7761
7762 /* Create a three-level trie */
7763 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7764 16*count2 + 128*count3 - 1);
7765 if (!result)
7766 return PyErr_NoMemory();
7767 PyObject_Init(result, &EncodingMapType);
7768 mresult = (struct encoding_map*)result;
7769 mresult->count2 = count2;
7770 mresult->count3 = count3;
7771 mlevel1 = mresult->level1;
7772 mlevel2 = mresult->level23;
7773 mlevel3 = mresult->level23 + 16*count2;
7774 memcpy(mlevel1, level1, 32);
7775 memset(mlevel2, 0xFF, 16*count2);
7776 memset(mlevel3, 0, 128*count3);
7777 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007778 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007779 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007780 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7781 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782 /* unmapped character */
7783 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007784 o1 = ch>>11;
7785 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786 i2 = 16*mlevel1[o1] + o2;
7787 if (mlevel2[i2] == 0xFF)
7788 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007789 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007790 i3 = 128*mlevel2[i2] + o3;
7791 mlevel3[i3] = i;
7792 }
7793 return result;
7794}
7795
7796static int
Victor Stinner22168992011-11-20 17:09:18 +01007797encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007798{
7799 struct encoding_map *map = (struct encoding_map*)mapping;
7800 int l1 = c>>11;
7801 int l2 = (c>>7) & 0xF;
7802 int l3 = c & 0x7F;
7803 int i;
7804
Victor Stinner22168992011-11-20 17:09:18 +01007805 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807 if (c == 0)
7808 return 0;
7809 /* level 1*/
7810 i = map->level1[l1];
7811 if (i == 0xFF) {
7812 return -1;
7813 }
7814 /* level 2*/
7815 i = map->level23[16*i+l2];
7816 if (i == 0xFF) {
7817 return -1;
7818 }
7819 /* level 3 */
7820 i = map->level23[16*map->count2 + 128*i + l3];
7821 if (i == 0) {
7822 return -1;
7823 }
7824 return i;
7825}
7826
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827/* Lookup the character ch in the mapping. If the character
7828 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007829 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007830static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007831charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832{
Christian Heimes217cfd12007-12-02 14:31:20 +00007833 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007834 PyObject *x;
7835
7836 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007838 x = PyObject_GetItem(mapping, w);
7839 Py_DECREF(w);
7840 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7842 /* No mapping found means: mapping is undefined. */
7843 PyErr_Clear();
7844 x = Py_None;
7845 Py_INCREF(x);
7846 return x;
7847 } else
7848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007850 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007852 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 long value = PyLong_AS_LONG(x);
7854 if (value < 0 || value > 255) {
7855 PyErr_SetString(PyExc_TypeError,
7856 "character mapping must be in range(256)");
7857 Py_DECREF(x);
7858 return NULL;
7859 }
7860 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007862 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 /* wrong return value */
7866 PyErr_Format(PyExc_TypeError,
7867 "character mapping must return integer, bytes or None, not %.400s",
7868 x->ob_type->tp_name);
7869 Py_DECREF(x);
7870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 }
7872}
7873
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007875charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007876{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7878 /* exponentially overallocate to minimize reallocations */
7879 if (requiredsize < 2*outsize)
7880 requiredsize = 2*outsize;
7881 if (_PyBytes_Resize(outobj, requiredsize))
7882 return -1;
7883 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007884}
7885
/* Outcome of encoding one character through a character map. */
typedef enum charmapencode_result {
    /* the encoded byte(s) were written to the output */
    enc_SUCCESS,
    /* the mapping has no entry for the character; nothing was written */
    enc_FAILED,
    /* the lookup or the output resize failed */
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007890 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007891 space is available. Return a new reference to the object that
7892 was put in the output buffer, or Py_None, if the mapping was undefined
7893 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007894 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007895static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007896charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007897 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007898{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 PyObject *rep;
7900 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007901 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902
Christian Heimes90aa7642007-12-19 02:45:37 +00007903 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 if (res == -1)
7907 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 if (outsize<requiredsize)
7909 if (charmapencode_resize(outobj, outpos, requiredsize))
7910 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007911 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 outstart[(*outpos)++] = (char)res;
7913 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 }
7915
7916 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007917 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007919 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 Py_DECREF(rep);
7921 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 if (PyLong_Check(rep)) {
7924 Py_ssize_t requiredsize = *outpos+1;
7925 if (outsize<requiredsize)
7926 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7927 Py_DECREF(rep);
7928 return enc_EXCEPTION;
7929 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007930 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007932 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 else {
7934 const char *repchars = PyBytes_AS_STRING(rep);
7935 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7936 Py_ssize_t requiredsize = *outpos+repsize;
7937 if (outsize<requiredsize)
7938 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7939 Py_DECREF(rep);
7940 return enc_EXCEPTION;
7941 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007942 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 memcpy(outstart + *outpos, repchars, repsize);
7944 *outpos += repsize;
7945 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 Py_DECREF(rep);
7948 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949}
7950
7951/* handle an error in PyUnicode_EncodeCharmap
7952 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007953static int
7954charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007955 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007957 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007958 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959{
7960 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007961 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007962 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007963 enum PyUnicode_Kind kind;
7964 void *data;
7965 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007967 Py_ssize_t collstartpos = *inpos;
7968 Py_ssize_t collendpos = *inpos+1;
7969 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 char *encoding = "charmap";
7971 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007972 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007973 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007974 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975
Benjamin Petersonbac79492012-01-14 13:34:47 -05007976 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007977 return -1;
7978 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 /* find all unencodable characters */
7980 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007981 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007982 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007983 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007984 val = encoding_map_lookup(ch, mapping);
7985 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 break;
7987 ++collendpos;
7988 continue;
7989 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007991 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7992 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 if (rep==NULL)
7994 return -1;
7995 else if (rep!=Py_None) {
7996 Py_DECREF(rep);
7997 break;
7998 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007999 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 }
8002 /* cache callback name lookup
8003 * (if not done yet, i.e. it's the first error) */
8004 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 if ((errors==NULL) || (!strcmp(errors, "strict")))
8006 *known_errorHandler = 1;
8007 else if (!strcmp(errors, "replace"))
8008 *known_errorHandler = 2;
8009 else if (!strcmp(errors, "ignore"))
8010 *known_errorHandler = 3;
8011 else if (!strcmp(errors, "xmlcharrefreplace"))
8012 *known_errorHandler = 4;
8013 else
8014 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015 }
8016 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008018 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 return -1;
8020 case 2: /* replace */
8021 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 x = charmapencode_output('?', mapping, res, respos);
8023 if (x==enc_EXCEPTION) {
8024 return -1;
8025 }
8026 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008027 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return -1;
8029 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008030 }
8031 /* fall through */
8032 case 3: /* ignore */
8033 *inpos = collendpos;
8034 break;
8035 case 4: /* xmlcharrefreplace */
8036 /* generate replacement (temporarily (mis)uses p) */
8037 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 char buffer[2+29+1+1];
8039 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008040 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 for (cp = buffer; *cp; ++cp) {
8042 x = charmapencode_output(*cp, mapping, res, respos);
8043 if (x==enc_EXCEPTION)
8044 return -1;
8045 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008046 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return -1;
8048 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 }
8050 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 *inpos = collendpos;
8052 break;
8053 default:
8054 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008055 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008059 if (PyBytes_Check(repunicode)) {
8060 /* Directly copy bytes result to output. */
8061 Py_ssize_t outsize = PyBytes_Size(*res);
8062 Py_ssize_t requiredsize;
8063 repsize = PyBytes_Size(repunicode);
8064 requiredsize = *respos + repsize;
8065 if (requiredsize > outsize)
8066 /* Make room for all additional bytes. */
8067 if (charmapencode_resize(res, respos, requiredsize)) {
8068 Py_DECREF(repunicode);
8069 return -1;
8070 }
8071 memcpy(PyBytes_AsString(*res) + *respos,
8072 PyBytes_AsString(repunicode), repsize);
8073 *respos += repsize;
8074 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008075 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008076 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008077 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008079 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008080 Py_DECREF(repunicode);
8081 return -1;
8082 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008083 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008084 data = PyUnicode_DATA(repunicode);
8085 kind = PyUnicode_KIND(repunicode);
8086 for (index = 0; index < repsize; index++) {
8087 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8088 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008090 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 return -1;
8092 }
8093 else if (x==enc_FAILED) {
8094 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008095 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 return -1;
8097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 }
8099 *inpos = newpos;
8100 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101 }
8102 return 0;
8103}
8104
Alexander Belopolsky40018472011-02-26 01:02:56 +00008105PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008106_PyUnicode_EncodeCharmap(PyObject *unicode,
8107 PyObject *mapping,
8108 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 /* output object */
8111 PyObject *res = NULL;
8112 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008113 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008114 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008116 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 PyObject *errorHandler = NULL;
8118 PyObject *exc = NULL;
8119 /* the following variable is used for caching string comparisons
8120 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8121 * 3=ignore, 4=xmlcharrefreplace */
8122 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
Benjamin Petersonbac79492012-01-14 13:34:47 -05008124 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008125 return NULL;
8126 size = PyUnicode_GET_LENGTH(unicode);
8127
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128 /* Default to Latin-1 */
8129 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132 /* allocate enough for a simple encoding without
8133 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008134 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 if (res == NULL)
8136 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008137 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008141 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008143 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 if (x==enc_EXCEPTION) /* error */
8145 goto onError;
8146 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008147 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 &exc,
8149 &known_errorHandler, &errorHandler, errors,
8150 &res, &respos)) {
8151 goto onError;
8152 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 else
8155 /* done with this character => adjust input position */
8156 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008160 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008161 if (_PyBytes_Resize(&res, respos) < 0)
8162 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 Py_XDECREF(exc);
8165 Py_XDECREF(errorHandler);
8166 return res;
8167
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008169 Py_XDECREF(res);
8170 Py_XDECREF(exc);
8171 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 return NULL;
8173}
8174
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008175/* Deprecated */
8176PyObject *
8177PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8178 Py_ssize_t size,
8179 PyObject *mapping,
8180 const char *errors)
8181{
8182 PyObject *result;
8183 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8184 if (unicode == NULL)
8185 return NULL;
8186 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8187 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008188 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008189}
8190
Alexander Belopolsky40018472011-02-26 01:02:56 +00008191PyObject *
8192PyUnicode_AsCharmapString(PyObject *unicode,
8193 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194{
8195 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 PyErr_BadArgument();
8197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008199 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200}
8201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008203static void
8204make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008206 Py_ssize_t startpos, Py_ssize_t endpos,
8207 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008210 *exceptionObject = _PyUnicodeTranslateError_Create(
8211 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212 }
8213 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8215 goto onError;
8216 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8217 goto onError;
8218 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8219 goto onError;
8220 return;
8221 onError:
8222 Py_DECREF(*exceptionObject);
8223 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224 }
8225}
8226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008228static void
8229raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231 Py_ssize_t startpos, Py_ssize_t endpos,
8232 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008233{
8234 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238}
8239
8240/* error handling callback helper:
8241 build arguments, call the callback and check the arguments,
8242 put the result into newpos and return the replacement string, which
8243 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244static PyObject *
8245unicode_translate_call_errorhandler(const char *errors,
8246 PyObject **errorHandler,
8247 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008249 Py_ssize_t startpos, Py_ssize_t endpos,
8250 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008252 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008254 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255 PyObject *restuple;
8256 PyObject *resunicode;
8257
8258 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 }
8263
8264 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268
8269 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008274 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 Py_DECREF(restuple);
8276 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 }
8278 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 &resunicode, &i_newpos)) {
8280 Py_DECREF(restuple);
8281 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008283 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008285 else
8286 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8289 Py_DECREF(restuple);
8290 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008291 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 Py_INCREF(resunicode);
8293 Py_DECREF(restuple);
8294 return resunicode;
8295}
8296
8297/* Lookup the character ch in the mapping and put the result in result,
8298 which must be decrefed by the caller.
8299 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302{
Christian Heimes217cfd12007-12-02 14:31:20 +00008303 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 PyObject *x;
8305
8306 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 x = PyObject_GetItem(mapping, w);
8309 Py_DECREF(w);
8310 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8312 /* No mapping found means: use 1:1 mapping. */
8313 PyErr_Clear();
8314 *result = NULL;
8315 return 0;
8316 } else
8317 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318 }
8319 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 *result = x;
8321 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008323 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 long value = PyLong_AS_LONG(x);
8325 long max = PyUnicode_GetMax();
8326 if (value < 0 || value > max) {
8327 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008328 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 Py_DECREF(x);
8330 return -1;
8331 }
8332 *result = x;
8333 return 0;
8334 }
8335 else if (PyUnicode_Check(x)) {
8336 *result = x;
8337 return 0;
8338 }
8339 else {
8340 /* wrong return value */
8341 PyErr_SetString(PyExc_TypeError,
8342 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008343 Py_DECREF(x);
8344 return -1;
8345 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346}
8347/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 if not reallocate and adjust various state variables.
8349 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008350static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008355 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008356 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 /* exponentially overallocate to minimize reallocations */
8358 if (requiredsize < 2 * oldsize)
8359 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008360 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8361 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008363 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 }
8366 return 0;
8367}
8368/* lookup the character, put the result in the output string and adjust
8369 various state variables. Return a new reference to the object that
8370 was put in the output buffer in *result, or Py_None, if the mapping was
8371 undefined (in which case no character was written).
8372 The called must decref result.
8373 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8376 PyObject *mapping, Py_UCS4 **output,
8377 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8381 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 }
8387 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008389 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 }
8393 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 Py_ssize_t repsize;
8395 if (PyUnicode_READY(*res) == -1)
8396 return -1;
8397 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 if (repsize==1) {
8399 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 }
8402 else if (repsize!=0) {
8403 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 Py_ssize_t requiredsize = *opos +
8405 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 Py_ssize_t i;
8408 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 for(i = 0; i < repsize; i++)
8411 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 return 0;
8417}
8418
Alexander Belopolsky40018472011-02-26 01:02:56 +00008419PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420_PyUnicode_TranslateCharmap(PyObject *input,
8421 PyObject *mapping,
8422 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 /* input object */
8425 char *idata;
8426 Py_ssize_t size, i;
8427 int kind;
8428 /* output buffer */
8429 Py_UCS4 *output = NULL;
8430 Py_ssize_t osize;
8431 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 char *reason = "character maps to <undefined>";
8435 PyObject *errorHandler = NULL;
8436 PyObject *exc = NULL;
8437 /* the following variable is used for caching string comparisons
8438 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8439 * 3=ignore, 4=xmlcharrefreplace */
8440 int known_errorHandler = -1;
8441
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 PyErr_BadArgument();
8444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 if (PyUnicode_READY(input) == -1)
8448 return NULL;
8449 idata = (char*)PyUnicode_DATA(input);
8450 kind = PyUnicode_KIND(input);
8451 size = PyUnicode_GET_LENGTH(input);
8452 i = 0;
8453
8454 if (size == 0) {
8455 Py_INCREF(input);
8456 return input;
8457 }
8458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 /* allocate enough for a simple 1:1 translation without
8460 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 osize = size;
8462 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8463 opos = 0;
8464 if (output == NULL) {
8465 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 /* try to encode it */
8471 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 if (charmaptranslate_output(input, i, mapping,
8473 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 Py_XDECREF(x);
8475 goto onError;
8476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 else { /* untranslatable character */
8481 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8482 Py_ssize_t repsize;
8483 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 Py_ssize_t collstart = i;
8487 Py_ssize_t collend = i+1;
8488 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 while (collend < size) {
8492 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 goto onError;
8494 Py_XDECREF(x);
8495 if (x!=Py_None)
8496 break;
8497 ++collend;
8498 }
8499 /* cache callback name lookup
8500 * (if not done yet, i.e. it's the first error) */
8501 if (known_errorHandler==-1) {
8502 if ((errors==NULL) || (!strcmp(errors, "strict")))
8503 known_errorHandler = 1;
8504 else if (!strcmp(errors, "replace"))
8505 known_errorHandler = 2;
8506 else if (!strcmp(errors, "ignore"))
8507 known_errorHandler = 3;
8508 else if (!strcmp(errors, "xmlcharrefreplace"))
8509 known_errorHandler = 4;
8510 else
8511 known_errorHandler = 0;
8512 }
8513 switch (known_errorHandler) {
8514 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 raise_translate_exception(&exc, input, collstart,
8516 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 case 2: /* replace */
8519 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 for (coll = collstart; coll<collend; coll++)
8521 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 /* fall through */
8523 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 break;
8526 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 /* generate replacement (temporarily (mis)uses i) */
8528 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 char buffer[2+29+1+1];
8530 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8532 if (charmaptranslate_makespace(&output, &osize,
8533 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 goto onError;
8535 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 break;
8540 default:
8541 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 reason, input, &exc,
8543 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008544 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008546 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008547 Py_DECREF(repunicode);
8548 goto onError;
8549 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 repsize = PyUnicode_GET_LENGTH(repunicode);
8552 if (charmaptranslate_makespace(&output, &osize,
8553 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 Py_DECREF(repunicode);
8555 goto onError;
8556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 for (uni2 = 0; repsize-->0; ++uni2)
8558 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8559 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008562 }
8563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8565 if (!res)
8566 goto onError;
8567 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 Py_XDECREF(exc);
8569 Py_XDECREF(errorHandler);
8570 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 Py_XDECREF(exc);
8575 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 return NULL;
8577}
8578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579/* Deprecated. Use PyUnicode_Translate instead. */
8580PyObject *
8581PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8582 Py_ssize_t size,
8583 PyObject *mapping,
8584 const char *errors)
8585{
8586 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8587 if (!unicode)
8588 return NULL;
8589 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8590}
8591
Alexander Belopolsky40018472011-02-26 01:02:56 +00008592PyObject *
8593PyUnicode_Translate(PyObject *str,
8594 PyObject *mapping,
8595 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596{
8597 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008598
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 str = PyUnicode_FromObject(str);
8600 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 Py_DECREF(str);
8604 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008605
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 Py_XDECREF(str);
8608 return NULL;
8609}
Tim Petersced69f82003-09-16 20:30:58 +00008610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008612fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613{
8614 /* No need to call PyUnicode_READY(self) because this function is only
8615 called as a callback from fixup() which does it already. */
8616 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8617 const int kind = PyUnicode_KIND(self);
8618 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008619 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008620 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 Py_ssize_t i;
8622
8623 for (i = 0; i < len; ++i) {
8624 ch = PyUnicode_READ(kind, data, i);
8625 fixed = 0;
8626 if (ch > 127) {
8627 if (Py_UNICODE_ISSPACE(ch))
8628 fixed = ' ';
8629 else {
8630 const int decimal = Py_UNICODE_TODECIMAL(ch);
8631 if (decimal >= 0)
8632 fixed = '0' + decimal;
8633 }
8634 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008635 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008636 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 PyUnicode_WRITE(kind, data, i, fixed);
8638 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008639 else
8640 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 }
8643
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008644 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645}
8646
8647PyObject *
8648_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8649{
8650 if (!PyUnicode_Check(unicode)) {
8651 PyErr_BadInternalCall();
8652 return NULL;
8653 }
8654 if (PyUnicode_READY(unicode) == -1)
8655 return NULL;
8656 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8657 /* If the string is already ASCII, just return the same string */
8658 Py_INCREF(unicode);
8659 return unicode;
8660 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008661 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662}
8663
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008664PyObject *
8665PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8666 Py_ssize_t length)
8667{
Victor Stinnerf0124502011-11-21 23:12:56 +01008668 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008669 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008670 Py_UCS4 maxchar;
8671 enum PyUnicode_Kind kind;
8672 void *data;
8673
Victor Stinner99d7ad02012-02-22 13:37:39 +01008674 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008675 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008676 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008677 if (ch > 127) {
8678 int decimal = Py_UNICODE_TODECIMAL(ch);
8679 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008680 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008681 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008682 }
8683 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008684
8685 /* Copy to a new string */
8686 decimal = PyUnicode_New(length, maxchar);
8687 if (decimal == NULL)
8688 return decimal;
8689 kind = PyUnicode_KIND(decimal);
8690 data = PyUnicode_DATA(decimal);
8691 /* Iterate over code points */
8692 for (i = 0; i < length; i++) {
8693 Py_UNICODE ch = s[i];
8694 if (ch > 127) {
8695 int decimal = Py_UNICODE_TODECIMAL(ch);
8696 if (decimal >= 0)
8697 ch = '0' + decimal;
8698 }
8699 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008701 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008702}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008703/* --- Decimal Encoder ---------------------------------------------------- */
8704
Alexander Belopolsky40018472011-02-26 01:02:56 +00008705int
8706PyUnicode_EncodeDecimal(Py_UNICODE *s,
8707 Py_ssize_t length,
8708 char *output,
8709 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008710{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008711 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008712 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008713 enum PyUnicode_Kind kind;
8714 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008715
8716 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 PyErr_BadArgument();
8718 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008719 }
8720
Victor Stinner42bf7752011-11-21 22:52:58 +01008721 unicode = PyUnicode_FromUnicode(s, length);
8722 if (unicode == NULL)
8723 return -1;
8724
Benjamin Petersonbac79492012-01-14 13:34:47 -05008725 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008726 Py_DECREF(unicode);
8727 return -1;
8728 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008729 kind = PyUnicode_KIND(unicode);
8730 data = PyUnicode_DATA(unicode);
8731
Victor Stinnerb84d7232011-11-22 01:50:07 +01008732 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008733 PyObject *exc;
8734 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008736 Py_ssize_t startpos;
8737
8738 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008739
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008741 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008742 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 decimal = Py_UNICODE_TODECIMAL(ch);
8746 if (decimal >= 0) {
8747 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008748 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 continue;
8750 }
8751 if (0 < ch && ch < 256) {
8752 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008753 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 continue;
8755 }
Victor Stinner6345be92011-11-25 20:09:01 +01008756
Victor Stinner42bf7752011-11-21 22:52:58 +01008757 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008758 exc = NULL;
8759 raise_encode_exception(&exc, "decimal", unicode,
8760 startpos, startpos+1,
8761 "invalid decimal Unicode string");
8762 Py_XDECREF(exc);
8763 Py_DECREF(unicode);
8764 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008765 }
8766 /* 0-terminate the output string */
8767 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008768 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008769 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008770}
8771
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772/* --- Helpers ------------------------------------------------------------ */
8773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008775any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 Py_ssize_t start,
8777 Py_ssize_t end)
8778{
8779 int kind1, kind2, kind;
8780 void *buf1, *buf2;
8781 Py_ssize_t len1, len2, result;
8782
8783 kind1 = PyUnicode_KIND(s1);
8784 kind2 = PyUnicode_KIND(s2);
8785 kind = kind1 > kind2 ? kind1 : kind2;
8786 buf1 = PyUnicode_DATA(s1);
8787 buf2 = PyUnicode_DATA(s2);
8788 if (kind1 != kind)
8789 buf1 = _PyUnicode_AsKind(s1, kind);
8790 if (!buf1)
8791 return -2;
8792 if (kind2 != kind)
8793 buf2 = _PyUnicode_AsKind(s2, kind);
8794 if (!buf2) {
8795 if (kind1 != kind) PyMem_Free(buf1);
8796 return -2;
8797 }
8798 len1 = PyUnicode_GET_LENGTH(s1);
8799 len2 = PyUnicode_GET_LENGTH(s2);
8800
Victor Stinner794d5672011-10-10 03:21:36 +02008801 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008802 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008803 case PyUnicode_1BYTE_KIND:
8804 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8805 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8806 else
8807 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8808 break;
8809 case PyUnicode_2BYTE_KIND:
8810 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8811 break;
8812 case PyUnicode_4BYTE_KIND:
8813 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8814 break;
8815 default:
8816 assert(0); result = -2;
8817 }
8818 }
8819 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008820 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008821 case PyUnicode_1BYTE_KIND:
8822 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8823 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8824 else
8825 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8826 break;
8827 case PyUnicode_2BYTE_KIND:
8828 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8829 break;
8830 case PyUnicode_4BYTE_KIND:
8831 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8832 break;
8833 default:
8834 assert(0); result = -2;
8835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 }
8837
8838 if (kind1 != kind)
8839 PyMem_Free(buf1);
8840 if (kind2 != kind)
8841 PyMem_Free(buf2);
8842
8843 return result;
8844}
8845
8846Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008847_PyUnicode_InsertThousandsGrouping(
8848 PyObject *unicode, Py_ssize_t index,
8849 Py_ssize_t n_buffer,
8850 void *digits, Py_ssize_t n_digits,
8851 Py_ssize_t min_width,
8852 const char *grouping, PyObject *thousands_sep,
8853 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854{
Victor Stinner41a863c2012-02-24 00:37:51 +01008855 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008856 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008857 Py_ssize_t thousands_sep_len;
8858 Py_ssize_t len;
8859
8860 if (unicode != NULL) {
8861 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008862 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008863 }
8864 else {
8865 kind = PyUnicode_1BYTE_KIND;
8866 data = NULL;
8867 }
8868 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8869 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8870 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8871 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008872 if (thousands_sep_kind < kind) {
8873 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8874 if (!thousands_sep_data)
8875 return -1;
8876 }
8877 else {
8878 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8879 if (!data)
8880 return -1;
8881 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008882 }
8883
Benjamin Petersonead6b532011-12-20 17:23:42 -06008884 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008886 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008887 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008888 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008889 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008890 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008891 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008892 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008893 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008894 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008895 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008896 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008898 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008899 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008900 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008901 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008902 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008904 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008905 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008906 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008907 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008908 break;
8909 default:
8910 assert(0);
8911 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008913 if (unicode != NULL && thousands_sep_kind != kind) {
8914 if (thousands_sep_kind < kind)
8915 PyMem_Free(thousands_sep_data);
8916 else
8917 PyMem_Free(data);
8918 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008919 if (unicode == NULL) {
8920 *maxchar = 127;
8921 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008922 *maxchar = MAX_MAXCHAR(*maxchar,
8923 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008924 }
8925 }
8926 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927}
8928
8929
/* helper macro to fixup start/end slice values

   Clamps end to len, and resolves a negative start or end relative to len
   (flooring the result at 0).  start and end are modified in place and must
   be assignable; every argument may be evaluated more than once, so pass
   expressions without side effects.

   The body is wrapped in do { } while (0) so that an invocation followed by
   a semicolon is exactly one statement, which keeps it correct under an
   unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008944
Alexander Belopolsky40018472011-02-26 01:02:56 +00008945Py_ssize_t
8946PyUnicode_Count(PyObject *str,
8947 PyObject *substr,
8948 Py_ssize_t start,
8949 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008951 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008952 PyObject* str_obj;
8953 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 int kind1, kind2, kind;
8955 void *buf1 = NULL, *buf2 = NULL;
8956 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008957
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008958 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008959 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008961 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008962 if (!sub_obj) {
8963 Py_DECREF(str_obj);
8964 return -1;
8965 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008966 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008967 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 Py_DECREF(str_obj);
8969 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 }
Tim Petersced69f82003-09-16 20:30:58 +00008971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 kind1 = PyUnicode_KIND(str_obj);
8973 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008974 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008977 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008978 if (kind2 > kind) {
8979 Py_DECREF(sub_obj);
8980 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008981 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008982 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008983 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 if (!buf2)
8986 goto onError;
8987 len1 = PyUnicode_GET_LENGTH(str_obj);
8988 len2 = PyUnicode_GET_LENGTH(sub_obj);
8989
8990 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008991 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008993 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8994 result = asciilib_count(
8995 ((Py_UCS1*)buf1) + start, end - start,
8996 buf2, len2, PY_SSIZE_T_MAX
8997 );
8998 else
8999 result = ucs1lib_count(
9000 ((Py_UCS1*)buf1) + start, end - start,
9001 buf2, len2, PY_SSIZE_T_MAX
9002 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 break;
9004 case PyUnicode_2BYTE_KIND:
9005 result = ucs2lib_count(
9006 ((Py_UCS2*)buf1) + start, end - start,
9007 buf2, len2, PY_SSIZE_T_MAX
9008 );
9009 break;
9010 case PyUnicode_4BYTE_KIND:
9011 result = ucs4lib_count(
9012 ((Py_UCS4*)buf1) + start, end - start,
9013 buf2, len2, PY_SSIZE_T_MAX
9014 );
9015 break;
9016 default:
9017 assert(0); result = 0;
9018 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009019
9020 Py_DECREF(sub_obj);
9021 Py_DECREF(str_obj);
9022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 if (kind2 != kind)
9024 PyMem_Free(buf2);
9025
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 onError:
9028 Py_DECREF(sub_obj);
9029 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 if (kind2 != kind && buf2)
9031 PyMem_Free(buf2);
9032 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033}
9034
Alexander Belopolsky40018472011-02-26 01:02:56 +00009035Py_ssize_t
9036PyUnicode_Find(PyObject *str,
9037 PyObject *sub,
9038 Py_ssize_t start,
9039 Py_ssize_t end,
9040 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009042 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009043
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009045 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009048 if (!sub) {
9049 Py_DECREF(str);
9050 return -2;
9051 }
9052 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9053 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 Py_DECREF(str);
9055 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 }
Tim Petersced69f82003-09-16 20:30:58 +00009057
Victor Stinner794d5672011-10-10 03:21:36 +02009058 result = any_find_slice(direction,
9059 str, sub, start, end
9060 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009061
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009063 Py_DECREF(sub);
9064
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 return result;
9066}
9067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068Py_ssize_t
9069PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9070 Py_ssize_t start, Py_ssize_t end,
9071 int direction)
9072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009074 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 if (PyUnicode_READY(str) == -1)
9076 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009077 if (start < 0 || end < 0) {
9078 PyErr_SetString(PyExc_IndexError, "string index out of range");
9079 return -2;
9080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 if (end > PyUnicode_GET_LENGTH(str))
9082 end = PyUnicode_GET_LENGTH(str);
9083 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009084 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9085 kind, end-start, ch, direction);
9086 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009088 else
9089 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090}
9091
Alexander Belopolsky40018472011-02-26 01:02:56 +00009092static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009093tailmatch(PyObject *self,
9094 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009095 Py_ssize_t start,
9096 Py_ssize_t end,
9097 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 int kind_self;
9100 int kind_sub;
9101 void *data_self;
9102 void *data_sub;
9103 Py_ssize_t offset;
9104 Py_ssize_t i;
9105 Py_ssize_t end_sub;
9106
9107 if (PyUnicode_READY(self) == -1 ||
9108 PyUnicode_READY(substring) == -1)
9109 return 0;
9110
9111 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112 return 1;
9113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9115 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 kind_self = PyUnicode_KIND(self);
9120 data_self = PyUnicode_DATA(self);
9121 kind_sub = PyUnicode_KIND(substring);
9122 data_sub = PyUnicode_DATA(substring);
9123 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9124
9125 if (direction > 0)
9126 offset = end;
9127 else
9128 offset = start;
9129
9130 if (PyUnicode_READ(kind_self, data_self, offset) ==
9131 PyUnicode_READ(kind_sub, data_sub, 0) &&
9132 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9133 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9134 /* If both are of the same kind, memcmp is sufficient */
9135 if (kind_self == kind_sub) {
9136 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009137 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 data_sub,
9139 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009140 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 }
9142 /* otherwise we have to compare each character by first accesing it */
9143 else {
9144 /* We do not need to compare 0 and len(substring)-1 because
9145 the if statement above ensured already that they are equal
9146 when we end up here. */
9147 // TODO: honor direction and do a forward or backwards search
9148 for (i = 1; i < end_sub; ++i) {
9149 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9150 PyUnicode_READ(kind_sub, data_sub, i))
9151 return 0;
9152 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 }
9156
9157 return 0;
9158}
9159
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160Py_ssize_t
9161PyUnicode_Tailmatch(PyObject *str,
9162 PyObject *substr,
9163 Py_ssize_t start,
9164 Py_ssize_t end,
9165 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009167 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009168
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 str = PyUnicode_FromObject(str);
9170 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172 substr = PyUnicode_FromObject(substr);
9173 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 Py_DECREF(str);
9175 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176 }
Tim Petersced69f82003-09-16 20:30:58 +00009177
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009178 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 Py_DECREF(str);
9181 Py_DECREF(substr);
9182 return result;
9183}
9184
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185/* Apply fixfct filter to the Unicode object self and return a
9186 reference to the modified object */
9187
Alexander Belopolsky40018472011-02-26 01:02:56 +00009188static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009189fixup(PyObject *self,
9190 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 PyObject *u;
9193 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009194 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009196 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009198 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009199 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 /* fix functions return the new maximum character in a string,
9202 if the kind of the resulting unicode object does not change,
9203 everything is fine. Otherwise we need to change the string kind
9204 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009205 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009206
9207 if (maxchar_new == 0) {
9208 /* no changes */;
9209 if (PyUnicode_CheckExact(self)) {
9210 Py_DECREF(u);
9211 Py_INCREF(self);
9212 return self;
9213 }
9214 else
9215 return u;
9216 }
9217
Victor Stinnere6abb482012-05-02 01:15:40 +02009218 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219
Victor Stinnereaab6042011-12-11 22:22:39 +01009220 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009222
9223 /* In case the maximum character changed, we need to
9224 convert the string to the new category. */
9225 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9226 if (v == NULL) {
9227 Py_DECREF(u);
9228 return NULL;
9229 }
9230 if (maxchar_new > maxchar_old) {
9231 /* If the maxchar increased so that the kind changed, not all
9232 characters are representable anymore and we need to fix the
9233 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009234 _PyUnicode_FastCopyCharacters(v, 0,
9235 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009236 maxchar_old = fixfct(v);
9237 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 }
9239 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009240 _PyUnicode_FastCopyCharacters(v, 0,
9241 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009243 Py_DECREF(u);
9244 assert(_PyUnicode_CheckConsistency(v, 1));
9245 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246}
9247
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009248static PyObject *
9249ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009251 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9252 char *resdata, *data = PyUnicode_DATA(self);
9253 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009254
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255 res = PyUnicode_New(len, 127);
9256 if (res == NULL)
9257 return NULL;
9258 resdata = PyUnicode_DATA(res);
9259 if (lower)
9260 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009262 _Py_bytes_upper(resdata, data, len);
9263 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264}
9265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009267handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009269 Py_ssize_t j;
9270 int final_sigma;
9271 Py_UCS4 c;
9272 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009273
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009274 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9275
9276 where ! is a negation and \p{xxx} is a character with property xxx.
9277 */
9278 for (j = i - 1; j >= 0; j--) {
9279 c = PyUnicode_READ(kind, data, j);
9280 if (!_PyUnicode_IsCaseIgnorable(c))
9281 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009283 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9284 if (final_sigma) {
9285 for (j = i + 1; j < length; j++) {
9286 c = PyUnicode_READ(kind, data, j);
9287 if (!_PyUnicode_IsCaseIgnorable(c))
9288 break;
9289 }
9290 final_sigma = j == length || !_PyUnicode_IsCased(c);
9291 }
9292 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293}
9294
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009295static int
9296lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9297 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009299 /* Obscure special case. */
9300 if (c == 0x3A3) {
9301 mapped[0] = handle_capital_sigma(kind, data, length, i);
9302 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305}
9306
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009307static Py_ssize_t
9308do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009310 Py_ssize_t i, k = 0;
9311 int n_res, j;
9312 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009313
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009314 c = PyUnicode_READ(kind, data, 0);
9315 n_res = _PyUnicode_ToUpperFull(c, mapped);
9316 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009317 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009318 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009320 for (i = 1; i < length; i++) {
9321 c = PyUnicode_READ(kind, data, i);
9322 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9323 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009324 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009325 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009326 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009327 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009328 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329}
9330
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009331static Py_ssize_t
9332do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9333 Py_ssize_t i, k = 0;
9334
9335 for (i = 0; i < length; i++) {
9336 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9337 int n_res, j;
9338 if (Py_UNICODE_ISUPPER(c)) {
9339 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9340 }
9341 else if (Py_UNICODE_ISLOWER(c)) {
9342 n_res = _PyUnicode_ToUpperFull(c, mapped);
9343 }
9344 else {
9345 n_res = 1;
9346 mapped[0] = c;
9347 }
9348 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009349 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009350 res[k++] = mapped[j];
9351 }
9352 }
9353 return k;
9354}
9355
9356static Py_ssize_t
9357do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9358 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009360 Py_ssize_t i, k = 0;
9361
9362 for (i = 0; i < length; i++) {
9363 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9364 int n_res, j;
9365 if (lower)
9366 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9367 else
9368 n_res = _PyUnicode_ToUpperFull(c, mapped);
9369 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009370 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009371 res[k++] = mapped[j];
9372 }
9373 }
9374 return k;
9375}
9376
9377static Py_ssize_t
9378do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9379{
9380 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9381}
9382
9383static Py_ssize_t
9384do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9385{
9386 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9387}
9388
Benjamin Petersone51757f2012-01-12 21:10:29 -05009389static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009390do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9391{
9392 Py_ssize_t i, k = 0;
9393
9394 for (i = 0; i < length; i++) {
9395 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9396 Py_UCS4 mapped[3];
9397 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9398 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009399 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009400 res[k++] = mapped[j];
9401 }
9402 }
9403 return k;
9404}
9405
9406static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009407do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9408{
9409 Py_ssize_t i, k = 0;
9410 int previous_is_cased;
9411
9412 previous_is_cased = 0;
9413 for (i = 0; i < length; i++) {
9414 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9415 Py_UCS4 mapped[3];
9416 int n_res, j;
9417
9418 if (previous_is_cased)
9419 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9420 else
9421 n_res = _PyUnicode_ToTitleFull(c, mapped);
9422
9423 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009424 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009425 res[k++] = mapped[j];
9426 }
9427
9428 previous_is_cased = _PyUnicode_IsCased(c);
9429 }
9430 return k;
9431}
9432
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009433static PyObject *
9434case_operation(PyObject *self,
9435 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9436{
9437 PyObject *res = NULL;
9438 Py_ssize_t length, newlength = 0;
9439 int kind, outkind;
9440 void *data, *outdata;
9441 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9442
Benjamin Petersoneea48462012-01-16 14:28:50 -05009443 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009444
9445 kind = PyUnicode_KIND(self);
9446 data = PyUnicode_DATA(self);
9447 length = PyUnicode_GET_LENGTH(self);
9448 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9449 if (tmp == NULL)
9450 return PyErr_NoMemory();
9451 newlength = perform(kind, data, length, tmp, &maxchar);
9452 res = PyUnicode_New(newlength, maxchar);
9453 if (res == NULL)
9454 goto leave;
9455 tmpend = tmp + newlength;
9456 outdata = PyUnicode_DATA(res);
9457 outkind = PyUnicode_KIND(res);
9458 switch (outkind) {
9459 case PyUnicode_1BYTE_KIND:
9460 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9461 break;
9462 case PyUnicode_2BYTE_KIND:
9463 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9464 break;
9465 case PyUnicode_4BYTE_KIND:
9466 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9467 break;
9468 default:
9469 assert(0);
9470 break;
9471 }
9472 leave:
9473 PyMem_FREE(tmp);
9474 return res;
9475}
9476
Tim Peters8ce9f162004-08-27 01:49:32 +00009477PyObject *
9478PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009481 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009483 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009484 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9485 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009486 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009488 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009490 int use_memcpy;
9491 unsigned char *res_data = NULL, *sep_data = NULL;
9492 PyObject *last_obj;
9493 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494
Tim Peters05eba1f2004-08-27 21:32:02 +00009495 fseq = PySequence_Fast(seq, "");
9496 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009497 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009498 }
9499
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009500 /* NOTE: the following code can't call back into Python code,
9501 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009502 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009503
Tim Peters05eba1f2004-08-27 21:32:02 +00009504 seqlen = PySequence_Fast_GET_SIZE(fseq);
9505 /* If empty sequence, return u"". */
9506 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009507 Py_DECREF(fseq);
9508 Py_INCREF(unicode_empty);
9509 res = unicode_empty;
9510 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009511 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009512
Tim Peters05eba1f2004-08-27 21:32:02 +00009513 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009514 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009515 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009516 if (seqlen == 1) {
9517 if (PyUnicode_CheckExact(items[0])) {
9518 res = items[0];
9519 Py_INCREF(res);
9520 Py_DECREF(fseq);
9521 return res;
9522 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009523 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009524 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009525 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009526 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009527 /* Set up sep and seplen */
9528 if (separator == NULL) {
9529 /* fall back to a blank space separator */
9530 sep = PyUnicode_FromOrdinal(' ');
9531 if (!sep)
9532 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009533 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009534 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009535 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009536 else {
9537 if (!PyUnicode_Check(separator)) {
9538 PyErr_Format(PyExc_TypeError,
9539 "separator: expected str instance,"
9540 " %.80s found",
9541 Py_TYPE(separator)->tp_name);
9542 goto onError;
9543 }
9544 if (PyUnicode_READY(separator))
9545 goto onError;
9546 sep = separator;
9547 seplen = PyUnicode_GET_LENGTH(separator);
9548 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9549 /* inc refcount to keep this code path symmetric with the
9550 above case of a blank separator */
9551 Py_INCREF(sep);
9552 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009553 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009554 }
9555
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009556 /* There are at least two things to join, or else we have a subclass
9557 * of str in the sequence.
9558 * Do a pre-pass to figure out the total amount of space we'll
9559 * need (sz), and see whether all argument are strings.
9560 */
9561 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009562#ifdef Py_DEBUG
9563 use_memcpy = 0;
9564#else
9565 use_memcpy = 1;
9566#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009567 for (i = 0; i < seqlen; i++) {
9568 const Py_ssize_t old_sz = sz;
9569 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 if (!PyUnicode_Check(item)) {
9571 PyErr_Format(PyExc_TypeError,
9572 "sequence item %zd: expected str instance,"
9573 " %.80s found",
9574 i, Py_TYPE(item)->tp_name);
9575 goto onError;
9576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 if (PyUnicode_READY(item) == -1)
9578 goto onError;
9579 sz += PyUnicode_GET_LENGTH(item);
9580 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009581 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009582 if (i != 0)
9583 sz += seplen;
9584 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9585 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009587 goto onError;
9588 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009589 if (use_memcpy && last_obj != NULL) {
9590 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9591 use_memcpy = 0;
9592 }
9593 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009594 }
Tim Petersced69f82003-09-16 20:30:58 +00009595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009597 if (res == NULL)
9598 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009599
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009600 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009601#ifdef Py_DEBUG
9602 use_memcpy = 0;
9603#else
9604 if (use_memcpy) {
9605 res_data = PyUnicode_1BYTE_DATA(res);
9606 kind = PyUnicode_KIND(res);
9607 if (seplen != 0)
9608 sep_data = PyUnicode_1BYTE_DATA(sep);
9609 }
9610#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009612 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009613 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009615 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009616 if (use_memcpy) {
9617 Py_MEMCPY(res_data,
9618 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009619 kind * seplen);
9620 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 }
9622 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009623 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009624 res_offset += seplen;
9625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009627 itemlen = PyUnicode_GET_LENGTH(item);
9628 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009629 if (use_memcpy) {
9630 Py_MEMCPY(res_data,
9631 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009632 kind * itemlen);
9633 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009634 }
9635 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009636 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009637 res_offset += itemlen;
9638 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009639 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009640 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009641 if (use_memcpy)
9642 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009643 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009644 else
9645 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009646
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009649 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009653 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009655 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656 return NULL;
9657}
9658
/* Store `value` into `length` consecutive code units of the buffer
   `data`, beginning at index `start`.  `kind` selects the width of a
   code unit (1, 2 or 4 bytes) and must not be PyUnicode_WCHAR_KIND.

   No bounds or range checking is done: the caller guarantees that
   start + length fits in the buffer and that value fits in a code unit.
   `value` and `length` are evaluated more than once, so pass expressions
   without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9682
Victor Stinnerd3f08822012-05-29 12:57:52 +02009683void
9684_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9685 Py_UCS4 fill_char)
9686{
9687 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9688 const void *data = PyUnicode_DATA(unicode);
9689 assert(PyUnicode_IS_READY(unicode));
9690 assert(unicode_modifiable(unicode));
9691 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9692 assert(start >= 0);
9693 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9694 FILL(kind, data, fill_char, start, length);
9695}
9696
Victor Stinner3fe55312012-01-04 00:33:50 +01009697Py_ssize_t
9698PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9699 Py_UCS4 fill_char)
9700{
9701 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009702
9703 if (!PyUnicode_Check(unicode)) {
9704 PyErr_BadInternalCall();
9705 return -1;
9706 }
9707 if (PyUnicode_READY(unicode) == -1)
9708 return -1;
9709 if (unicode_check_modifiable(unicode))
9710 return -1;
9711
Victor Stinnerd3f08822012-05-29 12:57:52 +02009712 if (start < 0) {
9713 PyErr_SetString(PyExc_IndexError, "string index out of range");
9714 return -1;
9715 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009716 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9717 PyErr_SetString(PyExc_ValueError,
9718 "fill character is bigger than "
9719 "the string maximum character");
9720 return -1;
9721 }
9722
9723 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9724 length = Py_MIN(maxlen, length);
9725 if (length <= 0)
9726 return 0;
9727
Victor Stinnerd3f08822012-05-29 12:57:52 +02009728 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009729 return length;
9730}
9731
Victor Stinner9310abb2011-10-05 00:59:23 +02009732static PyObject *
9733pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009734 Py_ssize_t left,
9735 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 PyObject *u;
9739 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009740 int kind;
9741 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742
9743 if (left < 0)
9744 left = 0;
9745 if (right < 0)
9746 right = 0;
9747
Victor Stinnerc4b49542011-12-11 22:44:26 +01009748 if (left == 0 && right == 0)
9749 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9752 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009753 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9754 return NULL;
9755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009757 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009759 if (!u)
9760 return NULL;
9761
9762 kind = PyUnicode_KIND(u);
9763 data = PyUnicode_DATA(u);
9764 if (left)
9765 FILL(kind, data, fill, 0, left);
9766 if (right)
9767 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009768 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009769 assert(_PyUnicode_CheckConsistency(u, 1));
9770 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771}
9772
Alexander Belopolsky40018472011-02-26 01:02:56 +00009773PyObject *
9774PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777
9778 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009779 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009780 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009781 if (PyUnicode_READY(string) == -1) {
9782 Py_DECREF(string);
9783 return NULL;
9784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785
Benjamin Petersonead6b532011-12-20 17:23:42 -06009786 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009788 if (PyUnicode_IS_ASCII(string))
9789 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009790 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009791 PyUnicode_GET_LENGTH(string), keepends);
9792 else
9793 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009794 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009795 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 break;
9797 case PyUnicode_2BYTE_KIND:
9798 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009799 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 PyUnicode_GET_LENGTH(string), keepends);
9801 break;
9802 case PyUnicode_4BYTE_KIND:
9803 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009804 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 PyUnicode_GET_LENGTH(string), keepends);
9806 break;
9807 default:
9808 assert(0);
9809 list = 0;
9810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 Py_DECREF(string);
9812 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813}
9814
Alexander Belopolsky40018472011-02-26 01:02:56 +00009815static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009816split(PyObject *self,
9817 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009818 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 int kind1, kind2, kind;
9821 void *buf1, *buf2;
9822 Py_ssize_t len1, len2;
9823 PyObject* out;
9824
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009826 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 if (PyUnicode_READY(self) == -1)
9829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009832 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 if (PyUnicode_IS_ASCII(self))
9835 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009836 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009837 PyUnicode_GET_LENGTH(self), maxcount
9838 );
9839 else
9840 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009841 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009842 PyUnicode_GET_LENGTH(self), maxcount
9843 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 case PyUnicode_2BYTE_KIND:
9845 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009846 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 PyUnicode_GET_LENGTH(self), maxcount
9848 );
9849 case PyUnicode_4BYTE_KIND:
9850 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009851 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 PyUnicode_GET_LENGTH(self), maxcount
9853 );
9854 default:
9855 assert(0);
9856 return NULL;
9857 }
9858
9859 if (PyUnicode_READY(substring) == -1)
9860 return NULL;
9861
9862 kind1 = PyUnicode_KIND(self);
9863 kind2 = PyUnicode_KIND(substring);
9864 kind = kind1 > kind2 ? kind1 : kind2;
9865 buf1 = PyUnicode_DATA(self);
9866 buf2 = PyUnicode_DATA(substring);
9867 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (!buf1)
9870 return NULL;
9871 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 if (!buf2) {
9874 if (kind1 != kind) PyMem_Free(buf1);
9875 return NULL;
9876 }
9877 len1 = PyUnicode_GET_LENGTH(self);
9878 len2 = PyUnicode_GET_LENGTH(substring);
9879
Benjamin Petersonead6b532011-12-20 17:23:42 -06009880 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009882 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9883 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009884 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009885 else
9886 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009887 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 break;
9889 case PyUnicode_2BYTE_KIND:
9890 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009891 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 break;
9893 case PyUnicode_4BYTE_KIND:
9894 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009895 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 break;
9897 default:
9898 out = NULL;
9899 }
9900 if (kind1 != kind)
9901 PyMem_Free(buf1);
9902 if (kind2 != kind)
9903 PyMem_Free(buf2);
9904 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905}
9906
Alexander Belopolsky40018472011-02-26 01:02:56 +00009907static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009908rsplit(PyObject *self,
9909 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009910 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 int kind1, kind2, kind;
9913 void *buf1, *buf2;
9914 Py_ssize_t len1, len2;
9915 PyObject* out;
9916
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009917 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009918 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 if (PyUnicode_READY(self) == -1)
9921 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009924 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009926 if (PyUnicode_IS_ASCII(self))
9927 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009929 PyUnicode_GET_LENGTH(self), maxcount
9930 );
9931 else
9932 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009933 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009934 PyUnicode_GET_LENGTH(self), maxcount
9935 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 case PyUnicode_2BYTE_KIND:
9937 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009938 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 PyUnicode_GET_LENGTH(self), maxcount
9940 );
9941 case PyUnicode_4BYTE_KIND:
9942 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009943 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 PyUnicode_GET_LENGTH(self), maxcount
9945 );
9946 default:
9947 assert(0);
9948 return NULL;
9949 }
9950
9951 if (PyUnicode_READY(substring) == -1)
9952 return NULL;
9953
9954 kind1 = PyUnicode_KIND(self);
9955 kind2 = PyUnicode_KIND(substring);
9956 kind = kind1 > kind2 ? kind1 : kind2;
9957 buf1 = PyUnicode_DATA(self);
9958 buf2 = PyUnicode_DATA(substring);
9959 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 if (!buf1)
9962 return NULL;
9963 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009964 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 if (!buf2) {
9966 if (kind1 != kind) PyMem_Free(buf1);
9967 return NULL;
9968 }
9969 len1 = PyUnicode_GET_LENGTH(self);
9970 len2 = PyUnicode_GET_LENGTH(substring);
9971
Benjamin Petersonead6b532011-12-20 17:23:42 -06009972 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9975 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009976 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009977 else
9978 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009979 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 break;
9981 case PyUnicode_2BYTE_KIND:
9982 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009983 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 break;
9985 case PyUnicode_4BYTE_KIND:
9986 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009987 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 break;
9989 default:
9990 out = NULL;
9991 }
9992 if (kind1 != kind)
9993 PyMem_Free(buf1);
9994 if (kind2 != kind)
9995 PyMem_Free(buf2);
9996 return out;
9997}
9998
9999static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10001 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010003 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010005 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10006 return asciilib_find(buf1, len1, buf2, len2, offset);
10007 else
10008 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 case PyUnicode_2BYTE_KIND:
10010 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10011 case PyUnicode_4BYTE_KIND:
10012 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10013 }
10014 assert(0);
10015 return -1;
10016}
10017
10018static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010019anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10020 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010022 switch (kind) {
10023 case PyUnicode_1BYTE_KIND:
10024 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10025 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10026 else
10027 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10028 case PyUnicode_2BYTE_KIND:
10029 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10030 case PyUnicode_4BYTE_KIND:
10031 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10032 }
10033 assert(0);
10034 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010035}
10036
Alexander Belopolsky40018472011-02-26 01:02:56 +000010037static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038replace(PyObject *self, PyObject *str1,
10039 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 PyObject *u;
10042 char *sbuf = PyUnicode_DATA(self);
10043 char *buf1 = PyUnicode_DATA(str1);
10044 char *buf2 = PyUnicode_DATA(str2);
10045 int srelease = 0, release1 = 0, release2 = 0;
10046 int skind = PyUnicode_KIND(self);
10047 int kind1 = PyUnicode_KIND(str1);
10048 int kind2 = PyUnicode_KIND(str2);
10049 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10050 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10051 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010052 int mayshrink;
10053 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054
10055 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010058 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059
Victor Stinner59de0ee2011-10-07 10:01:28 +020010060 if (str1 == str2)
10061 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 if (skind < kind1)
10063 /* substring too wide to be present */
10064 goto nothing;
10065
Victor Stinner49a0a212011-10-12 23:46:10 +020010066 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10067 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10068 /* Replacing str1 with str2 may cause a maxchar reduction in the
10069 result string. */
10070 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010071 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010076 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010078 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010079 Py_UCS4 u1, u2;
10080 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010081 Py_ssize_t index, pos;
10082 char *src;
10083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010085 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10086 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010087 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010090 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010092 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010094
10095 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10096 index = 0;
10097 src = sbuf;
10098 while (--maxcount)
10099 {
10100 pos++;
10101 src += pos * PyUnicode_KIND(self);
10102 slen -= pos;
10103 index += pos;
10104 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10105 if (pos < 0)
10106 break;
10107 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10108 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010109 }
10110 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 int rkind = skind;
10112 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010113 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 if (kind1 < rkind) {
10116 /* widen substring */
10117 buf1 = _PyUnicode_AsKind(str1, rkind);
10118 if (!buf1) goto error;
10119 release1 = 1;
10120 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010121 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010122 if (i < 0)
10123 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (rkind > kind2) {
10125 /* widen replacement */
10126 buf2 = _PyUnicode_AsKind(str2, rkind);
10127 if (!buf2) goto error;
10128 release2 = 1;
10129 }
10130 else if (rkind < kind2) {
10131 /* widen self and buf1 */
10132 rkind = kind2;
10133 if (release1) PyMem_Free(buf1);
10134 sbuf = _PyUnicode_AsKind(self, rkind);
10135 if (!sbuf) goto error;
10136 srelease = 1;
10137 buf1 = _PyUnicode_AsKind(str1, rkind);
10138 if (!buf1) goto error;
10139 release1 = 1;
10140 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010141 u = PyUnicode_New(slen, maxchar);
10142 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010144 assert(PyUnicode_KIND(u) == rkind);
10145 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010146
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010148 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010153
10154 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010155 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010156 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010158 if (i == -1)
10159 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010160 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010162 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 }
10167 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t n, i, j, ires;
10169 Py_ssize_t product, new_size;
10170 int rkind = skind;
10171 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 buf1 = _PyUnicode_AsKind(str1, rkind);
10176 if (!buf1) goto error;
10177 release1 = 1;
10178 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010180 if (n == 0)
10181 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010183 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 buf2 = _PyUnicode_AsKind(str2, rkind);
10185 if (!buf2) goto error;
10186 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010189 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 rkind = kind2;
10191 sbuf = _PyUnicode_AsKind(self, rkind);
10192 if (!sbuf) goto error;
10193 srelease = 1;
10194 if (release1) PyMem_Free(buf1);
10195 buf1 = _PyUnicode_AsKind(str1, rkind);
10196 if (!buf1) goto error;
10197 release1 = 1;
10198 }
10199 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10200 PyUnicode_GET_LENGTH(str1))); */
10201 product = n * (len2-len1);
10202 if ((product / (len2-len1)) != n) {
10203 PyErr_SetString(PyExc_OverflowError,
10204 "replace string is too long");
10205 goto error;
10206 }
10207 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 if (new_size == 0) {
10209 Py_INCREF(unicode_empty);
10210 u = unicode_empty;
10211 goto done;
10212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10214 PyErr_SetString(PyExc_OverflowError,
10215 "replace string is too long");
10216 goto error;
10217 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 u = PyUnicode_New(new_size, maxchar);
10219 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010221 assert(PyUnicode_KIND(u) == rkind);
10222 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 ires = i = 0;
10224 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010225 while (n-- > 0) {
10226 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010230 if (j == -1)
10231 break;
10232 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010233 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010234 memcpy(res + rkind * ires,
10235 sbuf + rkind * i,
10236 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010238 }
10239 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010241 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010243 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 memcpy(res + rkind * ires,
10251 sbuf + rkind * i,
10252 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010253 }
10254 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 /* interleave */
10256 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010257 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010259 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010261 if (--n <= 0)
10262 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010263 memcpy(res + rkind * ires,
10264 sbuf + rkind * i,
10265 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 ires++;
10267 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010268 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010269 memcpy(res + rkind * ires,
10270 sbuf + rkind * i,
10271 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010272 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010273 }
10274
10275 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010276 unicode_adjust_maxchar(&u);
10277 if (u == NULL)
10278 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010280
10281 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (srelease)
10283 PyMem_FREE(sbuf);
10284 if (release1)
10285 PyMem_FREE(buf1);
10286 if (release2)
10287 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010288 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010290
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (srelease)
10294 PyMem_FREE(sbuf);
10295 if (release1)
10296 PyMem_FREE(buf1);
10297 if (release2)
10298 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010299 return unicode_result_unchanged(self);
10300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 error:
10302 if (srelease && sbuf)
10303 PyMem_FREE(sbuf);
10304 if (release1 && buf1)
10305 PyMem_FREE(buf1);
10306 if (release2 && buf2)
10307 PyMem_FREE(buf2);
10308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309}
10310
10311/* --- Unicode Object Methods --------------------------------------------- */
10312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010313PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010314 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315\n\
10316Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010317characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318
10319static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010320unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010322 if (PyUnicode_READY(self) == -1)
10323 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010324 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325}
10326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010327PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010328 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329\n\
10330Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010331have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332
10333static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010334unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010336 if (PyUnicode_READY(self) == -1)
10337 return NULL;
10338 if (PyUnicode_GET_LENGTH(self) == 0)
10339 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010340 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341}
10342
Benjamin Petersond5890c82012-01-14 13:23:30 -050010343PyDoc_STRVAR(casefold__doc__,
10344 "S.casefold() -> str\n\
10345\n\
10346Return a version of S suitable for caseless comparisons.");
10347
10348static PyObject *
10349unicode_casefold(PyObject *self)
10350{
10351 if (PyUnicode_READY(self) == -1)
10352 return NULL;
10353 if (PyUnicode_IS_ASCII(self))
10354 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010355 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010356}
10357
10358
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010359/* Argument converter. Coerces to a single unicode character */
10360
10361static int
10362convert_uc(PyObject *obj, void *addr)
10363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010366
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367 uniobj = PyUnicode_FromObject(obj);
10368 if (uniobj == NULL) {
10369 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010371 return 0;
10372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010374 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010375 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010376 Py_DECREF(uniobj);
10377 return 0;
10378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010380 Py_DECREF(uniobj);
10381 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010382}
10383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010387Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010388done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389
10390static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010391unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010393 Py_ssize_t marg, left;
10394 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 Py_UCS4 fillchar = ' ';
10396
Victor Stinnere9a29352011-10-01 02:14:59 +020010397 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399
Benjamin Petersonbac79492012-01-14 13:34:47 -050010400 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 return NULL;
10402
Victor Stinnerc4b49542011-12-11 22:44:26 +010010403 if (PyUnicode_GET_LENGTH(self) >= width)
10404 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405
Victor Stinnerc4b49542011-12-11 22:44:26 +010010406 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407 left = marg / 2 + (marg & width & 1);
10408
Victor Stinner9310abb2011-10-05 00:59:23 +020010409 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410}
10411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412/* This function assumes that str1 and str2 are readied by the caller. */
10413
Marc-André Lemburge5034372000-08-08 08:04:29 +000010414static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010415unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 int kind1, kind2;
10418 void *data1, *data2;
10419 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 kind1 = PyUnicode_KIND(str1);
10422 kind2 = PyUnicode_KIND(str2);
10423 data1 = PyUnicode_DATA(str1);
10424 data2 = PyUnicode_DATA(str2);
10425 len1 = PyUnicode_GET_LENGTH(str1);
10426 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 for (i = 0; i < len1 && i < len2; ++i) {
10429 Py_UCS4 c1, c2;
10430 c1 = PyUnicode_READ(kind1, data1, i);
10431 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010432
10433 if (c1 != c2)
10434 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010435 }
10436
10437 return (len1 < len2) ? -1 : (len1 != len2);
10438}
10439
Alexander Belopolsky40018472011-02-26 01:02:56 +000010440int
10441PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10444 if (PyUnicode_READY(left) == -1 ||
10445 PyUnicode_READY(right) == -1)
10446 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010447 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010449 PyErr_Format(PyExc_TypeError,
10450 "Can't compare %.100s and %.100s",
10451 left->ob_type->tp_name,
10452 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453 return -1;
10454}
10455
Martin v. Löwis5b222132007-06-10 09:51:05 +000010456int
10457PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 Py_ssize_t i;
10460 int kind;
10461 void *data;
10462 Py_UCS4 chr;
10463
Victor Stinner910337b2011-10-03 03:20:16 +020010464 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 if (PyUnicode_READY(uni) == -1)
10466 return -1;
10467 kind = PyUnicode_KIND(uni);
10468 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010469 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10471 if (chr != str[i])
10472 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010473 /* This check keeps Python strings that end in '\0' from comparing equal
10474 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010476 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010477 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010478 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010479 return 0;
10480}
10481
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010482
Benjamin Peterson29060642009-01-31 22:14:21 +000010483#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010484 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010485
Alexander Belopolsky40018472011-02-26 01:02:56 +000010486PyObject *
10487PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010488{
10489 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010490
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010491 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10492 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (PyUnicode_READY(left) == -1 ||
10494 PyUnicode_READY(right) == -1)
10495 return NULL;
10496 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10497 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010498 if (op == Py_EQ) {
10499 Py_INCREF(Py_False);
10500 return Py_False;
10501 }
10502 if (op == Py_NE) {
10503 Py_INCREF(Py_True);
10504 return Py_True;
10505 }
10506 }
10507 if (left == right)
10508 result = 0;
10509 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010510 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010511
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010512 /* Convert the return value to a Boolean */
10513 switch (op) {
10514 case Py_EQ:
10515 v = TEST_COND(result == 0);
10516 break;
10517 case Py_NE:
10518 v = TEST_COND(result != 0);
10519 break;
10520 case Py_LE:
10521 v = TEST_COND(result <= 0);
10522 break;
10523 case Py_GE:
10524 v = TEST_COND(result >= 0);
10525 break;
10526 case Py_LT:
10527 v = TEST_COND(result == -1);
10528 break;
10529 case Py_GT:
10530 v = TEST_COND(result == 1);
10531 break;
10532 default:
10533 PyErr_BadArgument();
10534 return NULL;
10535 }
10536 Py_INCREF(v);
10537 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010539
Brian Curtindfc80e32011-08-10 20:28:54 -050010540 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010541}
10542
Alexander Belopolsky40018472011-02-26 01:02:56 +000010543int
10544PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010545{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 int kind1, kind2, kind;
10548 void *buf1, *buf2;
10549 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010550 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010551
10552 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 sub = PyUnicode_FromObject(element);
10554 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 PyErr_Format(PyExc_TypeError,
10556 "'in <string>' requires string as left operand, not %s",
10557 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010558 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010559 }
10560
Thomas Wouters477c8d52006-05-27 19:21:47 +000010561 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010562 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010563 Py_DECREF(sub);
10564 return -1;
10565 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010566 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10567 Py_DECREF(sub);
10568 Py_DECREF(str);
10569 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 kind1 = PyUnicode_KIND(str);
10572 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010573 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf1 = PyUnicode_DATA(str);
10575 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010576 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010577 if (kind2 > kind) {
10578 Py_DECREF(sub);
10579 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010580 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010581 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010582 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (!buf2) {
10585 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010586 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 return -1;
10588 }
10589 len1 = PyUnicode_GET_LENGTH(str);
10590 len2 = PyUnicode_GET_LENGTH(sub);
10591
Benjamin Petersonead6b532011-12-20 17:23:42 -060010592 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 case PyUnicode_1BYTE_KIND:
10594 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10595 break;
10596 case PyUnicode_2BYTE_KIND:
10597 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10598 break;
10599 case PyUnicode_4BYTE_KIND:
10600 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10601 break;
10602 default:
10603 result = -1;
10604 assert(0);
10605 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606
10607 Py_DECREF(str);
10608 Py_DECREF(sub);
10609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (kind2 != kind)
10611 PyMem_Free(buf2);
10612
Guido van Rossum403d68b2000-03-13 15:55:09 +000010613 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010614}
10615
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616/* Concat to string or Unicode object giving a new Unicode object. */
10617
Alexander Belopolsky40018472011-02-26 01:02:56 +000010618PyObject *
10619PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010622 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010623 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624
10625 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010628 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010631 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632
10633 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010634 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010638 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641 }
10642
Victor Stinner488fa492011-12-12 00:01:39 +010010643 u_len = PyUnicode_GET_LENGTH(u);
10644 v_len = PyUnicode_GET_LENGTH(v);
10645 if (u_len > PY_SSIZE_T_MAX - v_len) {
10646 PyErr_SetString(PyExc_OverflowError,
10647 "strings are too large to concat");
10648 goto onError;
10649 }
10650 new_len = u_len + v_len;
10651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010653 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010654 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655
Guido van Rossumd57fd912000-03-10 22:53:23 +000010656 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010657 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010660 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10661 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662 Py_DECREF(u);
10663 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010664 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668 Py_XDECREF(u);
10669 Py_XDECREF(v);
10670 return NULL;
10671}
10672
Walter Dörwald1ab83302007-05-18 17:15:44 +000010673void
Victor Stinner23e56682011-10-03 03:54:37 +020010674PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010675{
Victor Stinner23e56682011-10-03 03:54:37 +020010676 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010677 Py_UCS4 maxchar, maxchar2;
10678 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010679
10680 if (p_left == NULL) {
10681 if (!PyErr_Occurred())
10682 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010683 return;
10684 }
Victor Stinner23e56682011-10-03 03:54:37 +020010685 left = *p_left;
10686 if (right == NULL || !PyUnicode_Check(left)) {
10687 if (!PyErr_Occurred())
10688 PyErr_BadInternalCall();
10689 goto error;
10690 }
10691
Benjamin Petersonbac79492012-01-14 13:34:47 -050010692 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010693 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010694 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010695 goto error;
10696
Victor Stinner488fa492011-12-12 00:01:39 +010010697 /* Shortcuts */
10698 if (left == unicode_empty) {
10699 Py_DECREF(left);
10700 Py_INCREF(right);
10701 *p_left = right;
10702 return;
10703 }
10704 if (right == unicode_empty)
10705 return;
10706
10707 left_len = PyUnicode_GET_LENGTH(left);
10708 right_len = PyUnicode_GET_LENGTH(right);
10709 if (left_len > PY_SSIZE_T_MAX - right_len) {
10710 PyErr_SetString(PyExc_OverflowError,
10711 "strings are too large to concat");
10712 goto error;
10713 }
10714 new_len = left_len + right_len;
10715
10716 if (unicode_modifiable(left)
10717 && PyUnicode_CheckExact(right)
10718 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010719 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10720 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010721 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010722 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010723 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10724 {
10725 /* append inplace */
10726 if (unicode_resize(p_left, new_len) != 0) {
10727 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10728 * deallocated so it cannot be put back into
10729 * 'variable'. The MemoryError is raised when there
10730 * is no value in 'variable', which might (very
10731 * remotely) be a cause of incompatibilities.
10732 */
10733 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010734 }
Victor Stinner488fa492011-12-12 00:01:39 +010010735 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010736 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010737 }
Victor Stinner488fa492011-12-12 00:01:39 +010010738 else {
10739 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10740 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010741 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010742
Victor Stinner488fa492011-12-12 00:01:39 +010010743 /* Concat the two Unicode strings */
10744 res = PyUnicode_New(new_len, maxchar);
10745 if (res == NULL)
10746 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010747 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10748 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010749 Py_DECREF(left);
10750 *p_left = res;
10751 }
10752 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010753 return;
10754
10755error:
Victor Stinner488fa492011-12-12 00:01:39 +010010756 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010757}
10758
10759void
10760PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10761{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010762 PyUnicode_Append(pleft, right);
10763 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010764}
10765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010766PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010769Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010770string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010771interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
10773static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010774unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010776 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010777 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010778 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 int kind1, kind2, kind;
10781 void *buf1, *buf2;
10782 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
Jesus Ceaac451502011-04-20 17:09:23 +020010784 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10785 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 kind1 = PyUnicode_KIND(self);
10789 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010790 if (kind2 > kind1)
10791 return PyLong_FromLong(0);
10792 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 buf1 = PyUnicode_DATA(self);
10794 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010796 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 if (!buf2) {
10798 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 return NULL;
10800 }
10801 len1 = PyUnicode_GET_LENGTH(self);
10802 len2 = PyUnicode_GET_LENGTH(substring);
10803
10804 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010805 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 case PyUnicode_1BYTE_KIND:
10807 iresult = ucs1lib_count(
10808 ((Py_UCS1*)buf1) + start, end - start,
10809 buf2, len2, PY_SSIZE_T_MAX
10810 );
10811 break;
10812 case PyUnicode_2BYTE_KIND:
10813 iresult = ucs2lib_count(
10814 ((Py_UCS2*)buf1) + start, end - start,
10815 buf2, len2, PY_SSIZE_T_MAX
10816 );
10817 break;
10818 case PyUnicode_4BYTE_KIND:
10819 iresult = ucs4lib_count(
10820 ((Py_UCS4*)buf1) + start, end - start,
10821 buf2, len2, PY_SSIZE_T_MAX
10822 );
10823 break;
10824 default:
10825 assert(0); iresult = 0;
10826 }
10827
10828 result = PyLong_FromSsize_t(iresult);
10829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 if (kind2 != kind)
10831 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
10833 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010834
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 return result;
10836}
10837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010839 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010841Encode S using the codec registered for encoding. Default encoding\n\
10842is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010843handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010844a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10845'xmlcharrefreplace' as well as any other name registered with\n\
10846codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
10848static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010849unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010851 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 char *encoding = NULL;
10853 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010854
Benjamin Peterson308d6372009-09-18 21:42:35 +000010855 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10856 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010858 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010859}
10860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010861PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010862 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863\n\
10864Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010865If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
10867static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010868unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010870 Py_ssize_t i, j, line_pos, src_len, incr;
10871 Py_UCS4 ch;
10872 PyObject *u;
10873 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010875 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
10878 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
Antoine Pitrou22425222011-10-04 19:10:51 +020010881 if (PyUnicode_READY(self) == -1)
10882 return NULL;
10883
Thomas Wouters7e474022000-07-16 12:04:32 +000010884 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 src_len = PyUnicode_GET_LENGTH(self);
10886 i = j = line_pos = 0;
10887 kind = PyUnicode_KIND(self);
10888 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010889 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010890 for (; i < src_len; i++) {
10891 ch = PyUnicode_READ(kind, src_data, i);
10892 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010893 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010894 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 goto overflow;
10898 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010899 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010900 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010904 goto overflow;
10905 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 if (ch == '\n' || ch == '\r')
10908 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010910 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010911 if (!found)
10912 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010913
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010915 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916 if (!u)
10917 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010918 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
Antoine Pitroue71d5742011-10-04 15:55:09 +020010922 for (; i < src_len; i++) {
10923 ch = PyUnicode_READ(kind, src_data, i);
10924 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010925 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 incr = tabsize - (line_pos % tabsize);
10927 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010928 FILL(kind, dest_data, ' ', j, incr);
10929 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010931 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010933 line_pos++;
10934 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010935 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010936 if (ch == '\n' || ch == '\r')
10937 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010939 }
10940 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010941 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010942
Antoine Pitroue71d5742011-10-04 15:55:09 +020010943 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010944 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946}
10947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010948PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950\n\
10951Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010952such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953arguments start and end are interpreted as in slice notation.\n\
10954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
10957static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010960 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010961 Py_ssize_t start;
10962 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010963 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
Jesus Ceaac451502011-04-20 17:09:23 +020010965 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10966 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (PyUnicode_READY(self) == -1)
10970 return NULL;
10971 if (PyUnicode_READY(substring) == -1)
10972 return NULL;
10973
Victor Stinner7931d9a2011-11-04 00:22:48 +010010974 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975
10976 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 if (result == -2)
10979 return NULL;
10980
Christian Heimes217cfd12007-12-02 14:31:20 +000010981 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982}
10983
10984static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010985unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010987 void *data;
10988 enum PyUnicode_Kind kind;
10989 Py_UCS4 ch;
10990 PyObject *res;
10991
10992 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10993 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010995 }
10996 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10997 PyErr_SetString(PyExc_IndexError, "string index out of range");
10998 return NULL;
10999 }
11000 kind = PyUnicode_KIND(self);
11001 data = PyUnicode_DATA(self);
11002 ch = PyUnicode_READ(kind, data, index);
11003 if (ch < 256)
11004 return get_latin1_char(ch);
11005
11006 res = PyUnicode_New(1, ch);
11007 if (res == NULL)
11008 return NULL;
11009 kind = PyUnicode_KIND(res);
11010 data = PyUnicode_DATA(res);
11011 PyUnicode_WRITE(kind, data, 0, ch);
11012 assert(_PyUnicode_CheckConsistency(res, 1));
11013 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014}
11015
Guido van Rossumc2504932007-09-18 19:42:40 +000011016/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011017 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011018static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011019unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020{
Guido van Rossumc2504932007-09-18 19:42:40 +000011021 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011022 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011023
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011024#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011025 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011026#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 if (_PyUnicode_HASH(self) != -1)
11028 return _PyUnicode_HASH(self);
11029 if (PyUnicode_READY(self) == -1)
11030 return -1;
11031 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011032 /*
11033 We make the hash of the empty string be 0, rather than using
11034 (prefix ^ suffix), since this slightly obfuscates the hash secret
11035 */
11036 if (len == 0) {
11037 _PyUnicode_HASH(self) = 0;
11038 return 0;
11039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040
11041 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011042#define HASH(P) \
11043 x ^= (Py_uhash_t) *P << 7; \
11044 while (--len >= 0) \
11045 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046
Georg Brandl2fb477c2012-02-21 00:33:36 +010011047 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 switch (PyUnicode_KIND(self)) {
11049 case PyUnicode_1BYTE_KIND: {
11050 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11051 HASH(c);
11052 break;
11053 }
11054 case PyUnicode_2BYTE_KIND: {
11055 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11056 HASH(s);
11057 break;
11058 }
11059 default: {
11060 Py_UCS4 *l;
11061 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11062 "Impossible switch case in unicode_hash");
11063 l = PyUnicode_4BYTE_DATA(self);
11064 HASH(l);
11065 break;
11066 }
11067 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011068 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11069 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070
Guido van Rossumc2504932007-09-18 19:42:40 +000011071 if (x == -1)
11072 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011074 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011078PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011079 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011081Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082
11083static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011086 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011087 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011088 Py_ssize_t start;
11089 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
Jesus Ceaac451502011-04-20 17:09:23 +020011091 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11092 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (PyUnicode_READY(self) == -1)
11096 return NULL;
11097 if (PyUnicode_READY(substring) == -1)
11098 return NULL;
11099
Victor Stinner7931d9a2011-11-04 00:22:48 +010011100 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101
11102 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 if (result == -2)
11105 return NULL;
11106
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107 if (result < 0) {
11108 PyErr_SetString(PyExc_ValueError, "substring not found");
11109 return NULL;
11110 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111
Christian Heimes217cfd12007-12-02 14:31:20 +000011112 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113}
11114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011115PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011118Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011119at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
11121static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011122unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 Py_ssize_t i, length;
11125 int kind;
11126 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127 int cased;
11128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 if (PyUnicode_READY(self) == -1)
11130 return NULL;
11131 length = PyUnicode_GET_LENGTH(self);
11132 kind = PyUnicode_KIND(self);
11133 data = PyUnicode_DATA(self);
11134
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (length == 1)
11137 return PyBool_FromLong(
11138 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011143
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 for (i = 0; i < length; i++) {
11146 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011147
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11149 return PyBool_FromLong(0);
11150 else if (!cased && Py_UNICODE_ISLOWER(ch))
11151 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011153 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154}
11155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011156PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011157 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011159Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011160at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
11162static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011163unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 Py_ssize_t i, length;
11166 int kind;
11167 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168 int cased;
11169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 if (PyUnicode_READY(self) == -1)
11171 return NULL;
11172 length = PyUnicode_GET_LENGTH(self);
11173 kind = PyUnicode_KIND(self);
11174 data = PyUnicode_DATA(self);
11175
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (length == 1)
11178 return PyBool_FromLong(
11179 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011181 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011184
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 for (i = 0; i < length; i++) {
11187 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011188
Benjamin Peterson29060642009-01-31 22:14:21 +000011189 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11190 return PyBool_FromLong(0);
11191 else if (!cased && Py_UNICODE_ISUPPER(ch))
11192 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011194 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195}
11196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011197PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011198 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011200Return True if S is a titlecased string and there is at least one\n\
11201character in S, i.e. upper- and titlecase characters may only\n\
11202follow uncased characters and lowercase characters only cased ones.\n\
11203Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
11205static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011206unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 Py_ssize_t i, length;
11209 int kind;
11210 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 int cased, previous_is_cased;
11212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 if (PyUnicode_READY(self) == -1)
11214 return NULL;
11215 length = PyUnicode_GET_LENGTH(self);
11216 kind = PyUnicode_KIND(self);
11217 data = PyUnicode_DATA(self);
11218
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 if (length == 1) {
11221 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11222 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11223 (Py_UNICODE_ISUPPER(ch) != 0));
11224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011226 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011229
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 cased = 0;
11231 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 for (i = 0; i < length; i++) {
11233 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011234
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11236 if (previous_is_cased)
11237 return PyBool_FromLong(0);
11238 previous_is_cased = 1;
11239 cased = 1;
11240 }
11241 else if (Py_UNICODE_ISLOWER(ch)) {
11242 if (!previous_is_cased)
11243 return PyBool_FromLong(0);
11244 previous_is_cased = 1;
11245 cased = 1;
11246 }
11247 else
11248 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011250 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251}
11252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011256Return True if all characters in S are whitespace\n\
11257and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
11259static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011260unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 Py_ssize_t i, length;
11263 int kind;
11264 void *data;
11265
11266 if (PyUnicode_READY(self) == -1)
11267 return NULL;
11268 length = PyUnicode_GET_LENGTH(self);
11269 kind = PyUnicode_KIND(self);
11270 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 if (length == 1)
11274 return PyBool_FromLong(
11275 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011277 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 for (i = 0; i < length; i++) {
11282 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011283 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011286 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287}
11288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011289PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011291\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011292Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011294
11295static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011296unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 Py_ssize_t i, length;
11299 int kind;
11300 void *data;
11301
11302 if (PyUnicode_READY(self) == -1)
11303 return NULL;
11304 length = PyUnicode_GET_LENGTH(self);
11305 kind = PyUnicode_KIND(self);
11306 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011307
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011308 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 if (length == 1)
11310 return PyBool_FromLong(
11311 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011312
11313 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 for (i = 0; i < length; i++) {
11318 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011321 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011322}
11323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011324PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011325 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011326\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011327Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011328and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011329
11330static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011331unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 int kind;
11334 void *data;
11335 Py_ssize_t len, i;
11336
11337 if (PyUnicode_READY(self) == -1)
11338 return NULL;
11339
11340 kind = PyUnicode_KIND(self);
11341 data = PyUnicode_DATA(self);
11342 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011343
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011344 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 if (len == 1) {
11346 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11347 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11348 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011349
11350 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 for (i = 0; i < len; i++) {
11355 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011356 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011358 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011359 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011360}
11361
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011362PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011365Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
11368static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011369unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 Py_ssize_t i, length;
11372 int kind;
11373 void *data;
11374
11375 if (PyUnicode_READY(self) == -1)
11376 return NULL;
11377 length = PyUnicode_GET_LENGTH(self);
11378 kind = PyUnicode_KIND(self);
11379 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (length == 1)
11383 return PyBool_FromLong(
11384 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011386 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 for (i = 0; i < length; i++) {
11391 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011392 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011394 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395}
11396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011397PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011400Return True if all characters in S are digits\n\
11401and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
11403static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011404unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 Py_ssize_t i, length;
11407 int kind;
11408 void *data;
11409
11410 if (PyUnicode_READY(self) == -1)
11411 return NULL;
11412 length = PyUnicode_GET_LENGTH(self);
11413 kind = PyUnicode_KIND(self);
11414 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 if (length == 1) {
11418 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11419 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011422 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 for (i = 0; i < length; i++) {
11427 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011430 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011436Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011440unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 Py_ssize_t i, length;
11443 int kind;
11444 void *data;
11445
11446 if (PyUnicode_READY(self) == -1)
11447 return NULL;
11448 length = PyUnicode_GET_LENGTH(self);
11449 kind = PyUnicode_KIND(self);
11450 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (length == 1)
11454 return PyBool_FromLong(
11455 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011457 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 for (i = 0; i < length; i++) {
11462 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011465 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466}
11467
Martin v. Löwis47383402007-08-15 07:32:56 +000011468int
11469PyUnicode_IsIdentifier(PyObject *self)
11470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 int kind;
11472 void *data;
11473 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011474 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (PyUnicode_READY(self) == -1) {
11477 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 }
11480
11481 /* Special case for empty strings */
11482 if (PyUnicode_GET_LENGTH(self) == 0)
11483 return 0;
11484 kind = PyUnicode_KIND(self);
11485 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011486
11487 /* PEP 3131 says that the first character must be in
11488 XID_Start and subsequent characters in XID_Continue,
11489 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011490 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011491 letters, digits, underscore). However, given the current
11492 definition of XID_Start and XID_Continue, it is sufficient
11493 to check just for these, except that _ must be allowed
11494 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011496 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011497 return 0;
11498
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011499 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011502 return 1;
11503}
11504
11505PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011507\n\
11508Return True if S is a valid identifier according\n\
11509to the language definition.");
11510
11511static PyObject*
11512unicode_isidentifier(PyObject *self)
11513{
11514 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11515}
11516
Georg Brandl559e5d72008-06-11 18:37:52 +000011517PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011519\n\
11520Return True if all characters in S are considered\n\
11521printable in repr() or S is empty, False otherwise.");
11522
11523static PyObject*
11524unicode_isprintable(PyObject *self)
11525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 Py_ssize_t i, length;
11527 int kind;
11528 void *data;
11529
11530 if (PyUnicode_READY(self) == -1)
11531 return NULL;
11532 length = PyUnicode_GET_LENGTH(self);
11533 kind = PyUnicode_KIND(self);
11534 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011535
11536 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 if (length == 1)
11538 return PyBool_FromLong(
11539 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 for (i = 0; i < length; i++) {
11542 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011543 Py_RETURN_FALSE;
11544 }
11545 }
11546 Py_RETURN_TRUE;
11547}
11548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011549PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011550 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551\n\
11552Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011553iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011556unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011558 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559}
11560
Martin v. Löwis18e16552006-02-15 17:27:45 +000011561static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011562unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (PyUnicode_READY(self) == -1)
11565 return -1;
11566 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567}
11568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011569PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011572Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011573done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574
11575static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011576unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011578 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 Py_UCS4 fillchar = ' ';
11580
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011581 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 return NULL;
11583
Benjamin Petersonbac79492012-01-14 13:34:47 -050011584 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011585 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
Victor Stinnerc4b49542011-12-11 22:44:26 +010011587 if (PyUnicode_GET_LENGTH(self) >= width)
11588 return unicode_result_unchanged(self);
11589
11590 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011596Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
11598static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011599unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011601 if (PyUnicode_READY(self) == -1)
11602 return NULL;
11603 if (PyUnicode_IS_ASCII(self))
11604 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011605 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a selector: its format string with the leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11616
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617/* externally visible for str.strip(unicode) */
11618PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 void *data;
11622 int kind;
11623 Py_ssize_t i, j, len;
11624 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11627 return NULL;
11628
11629 kind = PyUnicode_KIND(self);
11630 data = PyUnicode_DATA(self);
11631 len = PyUnicode_GET_LENGTH(self);
11632 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11633 PyUnicode_DATA(sepobj),
11634 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011635
Benjamin Peterson14339b62009-01-31 16:36:08 +000011636 i = 0;
11637 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 while (i < len &&
11639 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 i++;
11641 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011642 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643
Benjamin Peterson14339b62009-01-31 16:36:08 +000011644 j = len;
11645 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 do {
11647 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 } while (j >= i &&
11649 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652
Victor Stinner7931d9a2011-11-04 00:22:48 +010011653 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654}
11655
11656PyObject*
11657PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11658{
11659 unsigned char *data;
11660 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011661 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662
Victor Stinnerde636f32011-10-01 03:55:54 +020011663 if (PyUnicode_READY(self) == -1)
11664 return NULL;
11665
Victor Stinner684d5fd2012-05-03 02:32:34 +020011666 length = PyUnicode_GET_LENGTH(self);
11667 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011668
Victor Stinner684d5fd2012-05-03 02:32:34 +020011669 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011670 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671
Victor Stinnerde636f32011-10-01 03:55:54 +020011672 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011673 PyErr_SetString(PyExc_IndexError, "string index out of range");
11674 return NULL;
11675 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011676 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011677 Py_INCREF(unicode_empty);
11678 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011679 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011680
Victor Stinner684d5fd2012-05-03 02:32:34 +020011681 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011682 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011683 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011684 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011685 }
11686 else {
11687 kind = PyUnicode_KIND(self);
11688 data = PyUnicode_1BYTE_DATA(self);
11689 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011690 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011691 length);
11692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
11695static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011696do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 int kind;
11699 void *data;
11700 Py_ssize_t len, i, j;
11701
11702 if (PyUnicode_READY(self) == -1)
11703 return NULL;
11704
11705 kind = PyUnicode_KIND(self);
11706 data = PyUnicode_DATA(self);
11707 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 i = 0;
11710 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 i++;
11713 }
11714 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 j = len;
11717 if (striptype != LEFTSTRIP) {
11718 do {
11719 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 j++;
11722 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723
Victor Stinner7931d9a2011-11-04 00:22:48 +010011724 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725}
11726
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011727
11728static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011729do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011730{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011731 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011732
Benjamin Peterson14339b62009-01-31 16:36:08 +000011733 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11734 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011735
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 if (sep != NULL && sep != Py_None) {
11737 if (PyUnicode_Check(sep))
11738 return _PyUnicode_XStrip(self, striptype, sep);
11739 else {
11740 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 "%s arg must be None or str",
11742 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011743 return NULL;
11744 }
11745 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746
Benjamin Peterson14339b62009-01-31 16:36:08 +000011747 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011748}
11749
11750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011751PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753\n\
11754Return a copy of the string S with leading and trailing\n\
11755whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011756If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757
11758static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011759unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011760{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011761 if (PyTuple_GET_SIZE(args) == 0)
11762 return do_strip(self, BOTHSTRIP); /* Common case */
11763 else
11764 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011765}
11766
11767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011768PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770\n\
11771Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011772If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011773
11774static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011775unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 if (PyTuple_GET_SIZE(args) == 0)
11778 return do_strip(self, LEFTSTRIP); /* Common case */
11779 else
11780 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011781}
11782
11783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011784PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786\n\
11787Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011788If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011789
11790static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011791unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011792{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011793 if (PyTuple_GET_SIZE(args) == 0)
11794 return do_strip(self, RIGHTSTRIP); /* Common case */
11795 else
11796 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797}
11798
11799
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011801unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011803 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
Georg Brandl222de0f2009-04-12 12:01:50 +000011806 if (len < 1) {
11807 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011808 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Victor Stinnerc4b49542011-12-11 22:44:26 +010011811 /* no repeat, return original string */
11812 if (len == 1)
11813 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011814
Benjamin Petersonbac79492012-01-14 13:34:47 -050011815 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 return NULL;
11817
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011818 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011819 PyErr_SetString(PyExc_OverflowError,
11820 "repeated string is too long");
11821 return NULL;
11822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011824
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011825 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 if (!u)
11827 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011828 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (PyUnicode_GET_LENGTH(str) == 1) {
11831 const int kind = PyUnicode_KIND(str);
11832 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011833 if (kind == PyUnicode_1BYTE_KIND) {
11834 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011835 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011836 }
11837 else if (kind == PyUnicode_2BYTE_KIND) {
11838 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011839 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011840 ucs2[n] = fill_char;
11841 } else {
11842 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11843 assert(kind == PyUnicode_4BYTE_KIND);
11844 for (n = 0; n < len; ++n)
11845 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 }
11848 else {
11849 /* number of characters copied this far */
11850 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011851 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 char *to = (char *) PyUnicode_DATA(u);
11853 Py_MEMCPY(to, PyUnicode_DATA(str),
11854 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 n = (done <= nchars-done) ? done : nchars-done;
11857 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011858 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 }
11861
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011862 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011863 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864}
11865
Alexander Belopolsky40018472011-02-26 01:02:56 +000011866PyObject *
11867PyUnicode_Replace(PyObject *obj,
11868 PyObject *subobj,
11869 PyObject *replobj,
11870 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871{
11872 PyObject *self;
11873 PyObject *str1;
11874 PyObject *str2;
11875 PyObject *result;
11876
11877 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011878 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011881 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 Py_DECREF(self);
11883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 }
11885 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011886 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 Py_DECREF(self);
11888 Py_DECREF(str1);
11889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011891 if (PyUnicode_READY(self) == -1 ||
11892 PyUnicode_READY(str1) == -1 ||
11893 PyUnicode_READY(str2) == -1)
11894 result = NULL;
11895 else
11896 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 Py_DECREF(self);
11898 Py_DECREF(str1);
11899 Py_DECREF(str2);
11900 return result;
11901}
11902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011903PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011904 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905\n\
11906Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011907old replaced by new. If the optional argument count is\n\
11908given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
11910static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 PyObject *str1;
11914 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011915 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 PyObject *result;
11917
Martin v. Löwis18e16552006-02-15 17:27:45 +000011918 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011920 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011923 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 return NULL;
11925 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011926 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 Py_DECREF(str1);
11928 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011929 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011930 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11931 result = NULL;
11932 else
11933 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935 Py_DECREF(str1);
11936 Py_DECREF(str2);
11937 return result;
11938}
11939
Alexander Belopolsky40018472011-02-26 01:02:56 +000011940static PyObject *
11941unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011943 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 Py_ssize_t isize;
11945 Py_ssize_t osize, squote, dquote, i, o;
11946 Py_UCS4 max, quote;
11947 int ikind, okind;
11948 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011951 return NULL;
11952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 isize = PyUnicode_GET_LENGTH(unicode);
11954 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 /* Compute length of output, quote characters, and
11957 maximum character */
11958 osize = 2; /* quotes */
11959 max = 127;
11960 squote = dquote = 0;
11961 ikind = PyUnicode_KIND(unicode);
11962 for (i = 0; i < isize; i++) {
11963 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11964 switch (ch) {
11965 case '\'': squote++; osize++; break;
11966 case '"': dquote++; osize++; break;
11967 case '\\': case '\t': case '\r': case '\n':
11968 osize += 2; break;
11969 default:
11970 /* Fast-path ASCII */
11971 if (ch < ' ' || ch == 0x7f)
11972 osize += 4; /* \xHH */
11973 else if (ch < 0x7f)
11974 osize++;
11975 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11976 osize++;
11977 max = ch > max ? ch : max;
11978 }
11979 else if (ch < 0x100)
11980 osize += 4; /* \xHH */
11981 else if (ch < 0x10000)
11982 osize += 6; /* \uHHHH */
11983 else
11984 osize += 10; /* \uHHHHHHHH */
11985 }
11986 }
11987
11988 quote = '\'';
11989 if (squote) {
11990 if (dquote)
11991 /* Both squote and dquote present. Use squote,
11992 and escape them */
11993 osize += squote;
11994 else
11995 quote = '"';
11996 }
11997
11998 repr = PyUnicode_New(osize, max);
11999 if (repr == NULL)
12000 return NULL;
12001 okind = PyUnicode_KIND(repr);
12002 odata = PyUnicode_DATA(repr);
12003
12004 PyUnicode_WRITE(okind, odata, 0, quote);
12005 PyUnicode_WRITE(okind, odata, osize-1, quote);
12006
12007 for (i = 0, o = 1; i < isize; i++) {
12008 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012009
12010 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if ((ch == quote) || (ch == '\\')) {
12012 PyUnicode_WRITE(okind, odata, o++, '\\');
12013 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012014 continue;
12015 }
12016
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012018 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 PyUnicode_WRITE(okind, odata, o++, '\\');
12020 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012021 }
12022 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 PyUnicode_WRITE(okind, odata, o++, '\\');
12024 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012025 }
12026 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 PyUnicode_WRITE(okind, odata, o++, '\\');
12028 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012029 }
12030
12031 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012032 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 PyUnicode_WRITE(okind, odata, o++, '\\');
12034 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012037 }
12038
Georg Brandl559e5d72008-06-11 18:37:52 +000012039 /* Copy ASCII characters as-is */
12040 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012042 }
12043
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012045 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012046 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012047 (categories Z* and C* except ASCII space)
12048 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012050 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012051 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012056 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012057 /* Map 16-bit characters to '\uxxxx' */
12058 else if (ch <= 0xffff) {
12059 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012064 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012065 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012066 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012067 PyUnicode_WRITE(okind, odata, o++, 'U');
12068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012076 }
12077 }
12078 /* Copy characters as-is */
12079 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012081 }
12082 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012085 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012086 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087}
12088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012089PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091\n\
12092Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012093such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094arguments start and end are interpreted as in slice notation.\n\
12095\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012096Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
12098static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012101 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012102 Py_ssize_t start;
12103 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012104 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105
Jesus Ceaac451502011-04-20 17:09:23 +020012106 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12107 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (PyUnicode_READY(self) == -1)
12111 return NULL;
12112 if (PyUnicode_READY(substring) == -1)
12113 return NULL;
12114
Victor Stinner7931d9a2011-11-04 00:22:48 +010012115 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
12117 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (result == -2)
12120 return NULL;
12121
Christian Heimes217cfd12007-12-02 14:31:20 +000012122 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123}
12124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012125PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012128Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
12130static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012133 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012134 Py_ssize_t start;
12135 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012136 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
Jesus Ceaac451502011-04-20 17:09:23 +020012138 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12139 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (PyUnicode_READY(self) == -1)
12143 return NULL;
12144 if (PyUnicode_READY(substring) == -1)
12145 return NULL;
12146
Victor Stinner7931d9a2011-11-04 00:22:48 +010012147 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
12149 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 if (result == -2)
12152 return NULL;
12153
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154 if (result < 0) {
12155 PyErr_SetString(PyExc_ValueError, "substring not found");
12156 return NULL;
12157 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158
Christian Heimes217cfd12007-12-02 14:31:20 +000012159 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160}
12161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012162PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012163 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012165Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012166done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
12168static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012169unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012171 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 Py_UCS4 fillchar = ' ';
12173
Victor Stinnere9a29352011-10-01 02:14:59 +020012174 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012176
Benjamin Petersonbac79492012-01-14 13:34:47 -050012177 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178 return NULL;
12179
Victor Stinnerc4b49542011-12-11 22:44:26 +010012180 if (PyUnicode_GET_LENGTH(self) >= width)
12181 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Victor Stinnerc4b49542011-12-11 22:44:26 +010012183 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184}
12185
Alexander Belopolsky40018472011-02-26 01:02:56 +000012186PyObject *
12187PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188{
12189 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012190
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191 s = PyUnicode_FromObject(s);
12192 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012193 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 if (sep != NULL) {
12195 sep = PyUnicode_FromObject(sep);
12196 if (sep == NULL) {
12197 Py_DECREF(s);
12198 return NULL;
12199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200 }
12201
Victor Stinner9310abb2011-10-05 00:59:23 +020012202 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
12204 Py_DECREF(s);
12205 Py_XDECREF(sep);
12206 return result;
12207}
12208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012209PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012210 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211\n\
12212Return a list of the words in S, using sep as the\n\
12213delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012214splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012215whitespace string is a separator and empty strings are\n\
12216removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
12218static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012219unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012221 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012223 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012225 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12226 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 return NULL;
12228
12229 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012232 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012234 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235}
12236
Thomas Wouters477c8d52006-05-27 19:21:47 +000012237PyObject *
12238PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12239{
12240 PyObject* str_obj;
12241 PyObject* sep_obj;
12242 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 int kind1, kind2, kind;
12244 void *buf1 = NULL, *buf2 = NULL;
12245 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246
12247 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012248 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012250 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012251 if (!sep_obj) {
12252 Py_DECREF(str_obj);
12253 return NULL;
12254 }
12255 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12256 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012257 Py_DECREF(str_obj);
12258 return NULL;
12259 }
12260
Victor Stinner14f8f022011-10-05 20:58:25 +020012261 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012263 kind = Py_MAX(kind1, kind2);
12264 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012266 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 if (!buf1)
12268 goto onError;
12269 buf2 = PyUnicode_DATA(sep_obj);
12270 if (kind2 != kind)
12271 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12272 if (!buf2)
12273 goto onError;
12274 len1 = PyUnicode_GET_LENGTH(str_obj);
12275 len2 = PyUnicode_GET_LENGTH(sep_obj);
12276
Benjamin Petersonead6b532011-12-20 17:23:42 -060012277 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012279 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12280 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12281 else
12282 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 break;
12284 case PyUnicode_2BYTE_KIND:
12285 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12286 break;
12287 case PyUnicode_4BYTE_KIND:
12288 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12289 break;
12290 default:
12291 assert(0);
12292 out = 0;
12293 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294
12295 Py_DECREF(sep_obj);
12296 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (kind1 != kind)
12298 PyMem_Free(buf1);
12299 if (kind2 != kind)
12300 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301
12302 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 onError:
12304 Py_DECREF(sep_obj);
12305 Py_DECREF(str_obj);
12306 if (kind1 != kind && buf1)
12307 PyMem_Free(buf1);
12308 if (kind2 != kind && buf2)
12309 PyMem_Free(buf2);
12310 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311}
12312
12313
12314PyObject *
12315PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12316{
12317 PyObject* str_obj;
12318 PyObject* sep_obj;
12319 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 int kind1, kind2, kind;
12321 void *buf1 = NULL, *buf2 = NULL;
12322 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323
12324 str_obj = PyUnicode_FromObject(str_in);
12325 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327 sep_obj = PyUnicode_FromObject(sep_in);
12328 if (!sep_obj) {
12329 Py_DECREF(str_obj);
12330 return NULL;
12331 }
12332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 kind1 = PyUnicode_KIND(str_in);
12334 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012335 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 buf1 = PyUnicode_DATA(str_in);
12337 if (kind1 != kind)
12338 buf1 = _PyUnicode_AsKind(str_in, kind);
12339 if (!buf1)
12340 goto onError;
12341 buf2 = PyUnicode_DATA(sep_obj);
12342 if (kind2 != kind)
12343 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12344 if (!buf2)
12345 goto onError;
12346 len1 = PyUnicode_GET_LENGTH(str_obj);
12347 len2 = PyUnicode_GET_LENGTH(sep_obj);
12348
Benjamin Petersonead6b532011-12-20 17:23:42 -060012349 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012351 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12352 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12353 else
12354 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 break;
12356 case PyUnicode_2BYTE_KIND:
12357 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12358 break;
12359 case PyUnicode_4BYTE_KIND:
12360 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12361 break;
12362 default:
12363 assert(0);
12364 out = 0;
12365 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012366
12367 Py_DECREF(sep_obj);
12368 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 if (kind1 != kind)
12370 PyMem_Free(buf1);
12371 if (kind2 != kind)
12372 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373
12374 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 onError:
12376 Py_DECREF(sep_obj);
12377 Py_DECREF(str_obj);
12378 if (kind1 != kind && buf1)
12379 PyMem_Free(buf1);
12380 if (kind2 != kind && buf2)
12381 PyMem_Free(buf2);
12382 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383}
12384
12385PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012387\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012388Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012389the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012390found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012391
12392static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012393unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012394{
Victor Stinner9310abb2011-10-05 00:59:23 +020012395 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396}
12397
12398PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012399 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012400\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012401Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012402the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012403separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012404
12405static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012406unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012407{
Victor Stinner9310abb2011-10-05 00:59:23 +020012408 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012409}
12410
Alexander Belopolsky40018472011-02-26 01:02:56 +000012411PyObject *
12412PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012413{
12414 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012416 s = PyUnicode_FromObject(s);
12417 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012418 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 if (sep != NULL) {
12420 sep = PyUnicode_FromObject(sep);
12421 if (sep == NULL) {
12422 Py_DECREF(s);
12423 return NULL;
12424 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012425 }
12426
Victor Stinner9310abb2011-10-05 00:59:23 +020012427 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012428
12429 Py_DECREF(s);
12430 Py_XDECREF(sep);
12431 return result;
12432}
12433
12434PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012435 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012436\n\
12437Return a list of the words in S, using sep as the\n\
12438delimiter string, starting at the end of the string and\n\
12439working to the front. If maxsplit is given, at most maxsplit\n\
12440splits are done. If sep is not specified, any whitespace string\n\
12441is a separator.");
12442
12443static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012444unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012445{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012446 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012447 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012448 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012449
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012450 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12451 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012452 return NULL;
12453
12454 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012456 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012457 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012458 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012459 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012460}
12461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012462PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012463 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464\n\
12465Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012466Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012467is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468
12469static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012470unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012472 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012473 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012475 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12476 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477 return NULL;
12478
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012479 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480}
12481
12482static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012483PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012485 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490\n\
12491Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012492and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493
12494static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012495unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012497 if (PyUnicode_READY(self) == -1)
12498 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012499 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500}
12501
Georg Brandlceee0772007-11-27 23:48:05 +000012502PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012504\n\
12505Return a translation table usable for str.translate().\n\
12506If there is only one argument, it must be a dictionary mapping Unicode\n\
12507ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012508Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012509If there are two arguments, they must be strings of equal length, and\n\
12510in the resulting dictionary, each character in x will be mapped to the\n\
12511character at the same position in y. If there is a third argument, it\n\
12512must be a string, whose characters will be mapped to None in the result.");
12513
12514static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012515unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012516{
12517 PyObject *x, *y = NULL, *z = NULL;
12518 PyObject *new = NULL, *key, *value;
12519 Py_ssize_t i = 0;
12520 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012521
Georg Brandlceee0772007-11-27 23:48:05 +000012522 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12523 return NULL;
12524 new = PyDict_New();
12525 if (!new)
12526 return NULL;
12527 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 int x_kind, y_kind, z_kind;
12529 void *x_data, *y_data, *z_data;
12530
Georg Brandlceee0772007-11-27 23:48:05 +000012531 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012532 if (!PyUnicode_Check(x)) {
12533 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12534 "be a string if there is a second argument");
12535 goto err;
12536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012538 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12539 "arguments must have equal length");
12540 goto err;
12541 }
12542 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 x_kind = PyUnicode_KIND(x);
12544 y_kind = PyUnicode_KIND(y);
12545 x_data = PyUnicode_DATA(x);
12546 y_data = PyUnicode_DATA(y);
12547 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12548 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012549 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012550 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012551 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012552 if (!value) {
12553 Py_DECREF(key);
12554 goto err;
12555 }
Georg Brandlceee0772007-11-27 23:48:05 +000012556 res = PyDict_SetItem(new, key, value);
12557 Py_DECREF(key);
12558 Py_DECREF(value);
12559 if (res < 0)
12560 goto err;
12561 }
12562 /* create entries for deleting chars in z */
12563 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 z_kind = PyUnicode_KIND(z);
12565 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012566 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012568 if (!key)
12569 goto err;
12570 res = PyDict_SetItem(new, key, Py_None);
12571 Py_DECREF(key);
12572 if (res < 0)
12573 goto err;
12574 }
12575 }
12576 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 int kind;
12578 void *data;
12579
Georg Brandlceee0772007-11-27 23:48:05 +000012580 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012581 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012582 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12583 "to maketrans it must be a dict");
12584 goto err;
12585 }
12586 /* copy entries into the new dict, converting string keys to int keys */
12587 while (PyDict_Next(x, &i, &key, &value)) {
12588 if (PyUnicode_Check(key)) {
12589 /* convert string keys to integer keys */
12590 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012591 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012592 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12593 "table must be of length 1");
12594 goto err;
12595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 kind = PyUnicode_KIND(key);
12597 data = PyUnicode_DATA(key);
12598 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012599 if (!newkey)
12600 goto err;
12601 res = PyDict_SetItem(new, newkey, value);
12602 Py_DECREF(newkey);
12603 if (res < 0)
12604 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012605 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012606 /* just keep integer keys */
12607 if (PyDict_SetItem(new, key, value) < 0)
12608 goto err;
12609 } else {
12610 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12611 "be strings or integers");
12612 goto err;
12613 }
12614 }
12615 }
12616 return new;
12617 err:
12618 Py_DECREF(new);
12619 return NULL;
12620}
12621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012622PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624\n\
12625Return a copy of the string S, where all characters have been mapped\n\
12626through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012627Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012628Unmapped characters are left untouched. Characters mapped to None\n\
12629are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630
12631static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635}
12636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012637PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012640Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
12642static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012643unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012645 if (PyUnicode_READY(self) == -1)
12646 return NULL;
12647 if (PyUnicode_IS_ASCII(self))
12648 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012649 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650}
12651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012652PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012655Pad a numeric string S with zeros on the left, to fill a field\n\
12656of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657
12658static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012659unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012661 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012662 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012663 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 int kind;
12665 void *data;
12666 Py_UCS4 chr;
12667
Martin v. Löwis18e16552006-02-15 17:27:45 +000012668 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669 return NULL;
12670
Benjamin Petersonbac79492012-01-14 13:34:47 -050012671 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673
Victor Stinnerc4b49542011-12-11 22:44:26 +010012674 if (PyUnicode_GET_LENGTH(self) >= width)
12675 return unicode_result_unchanged(self);
12676
12677 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
12679 u = pad(self, fill, 0, '0');
12680
Walter Dörwald068325e2002-04-15 13:36:47 +000012681 if (u == NULL)
12682 return NULL;
12683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 kind = PyUnicode_KIND(u);
12685 data = PyUnicode_DATA(u);
12686 chr = PyUnicode_READ(kind, data, fill);
12687
12688 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 PyUnicode_WRITE(kind, data, 0, chr);
12691 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692 }
12693
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012694 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012695 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012706PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012709Return True if S starts with the specified prefix, False otherwise.\n\
12710With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012711With optional end, stop comparing S at that position.\n\
12712prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713
12714static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012715unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012718 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012719 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012720 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012721 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012722 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723
Jesus Ceaac451502011-04-20 17:09:23 +020012724 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012726 if (PyTuple_Check(subobj)) {
12727 Py_ssize_t i;
12728 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012729 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012730 if (substring == NULL)
12731 return NULL;
12732 result = tailmatch(self, substring, start, end, -1);
12733 Py_DECREF(substring);
12734 if (result) {
12735 Py_RETURN_TRUE;
12736 }
12737 }
12738 /* nothing matched */
12739 Py_RETURN_FALSE;
12740 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012741 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012742 if (substring == NULL) {
12743 if (PyErr_ExceptionMatches(PyExc_TypeError))
12744 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12745 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012747 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012748 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012750 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751}
12752
12753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012754PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012757Return True if S ends with the specified suffix, False otherwise.\n\
12758With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759With optional end, stop comparing S at that position.\n\
12760suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761
12762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012763unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012766 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012767 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012768 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012769 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012770 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771
Jesus Ceaac451502011-04-20 17:09:23 +020012772 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 if (PyTuple_Check(subobj)) {
12775 Py_ssize_t i;
12776 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012777 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 result = tailmatch(self, substring, start, end, +1);
12782 Py_DECREF(substring);
12783 if (result) {
12784 Py_RETURN_TRUE;
12785 }
12786 }
12787 Py_RETURN_FALSE;
12788 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012789 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012790 if (substring == NULL) {
12791 if (PyErr_ExceptionMatches(PyExc_TypeError))
12792 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12793 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012796 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
Victor Stinner202fdca2012-05-07 12:47:02 +020012801Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012802_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012803{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012804 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012805 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12806 writer->data = PyUnicode_DATA(writer->buffer);
12807 writer->kind = PyUnicode_KIND(writer->buffer);
12808}
12809
Victor Stinnerd3f08822012-05-29 12:57:52 +020012810void
12811_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012812{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012813 memset(writer, 0, sizeof(*writer));
12814#ifdef Py_DEBUG
12815 writer->kind = 5; /* invalid kind */
12816#endif
12817 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012818 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012819}
12820
Victor Stinnerd3f08822012-05-29 12:57:52 +020012821int
12822_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12823 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012824{
12825 Py_ssize_t newlen;
12826 PyObject *newbuffer;
12827
Victor Stinnerd3f08822012-05-29 12:57:52 +020012828 assert(length > 0);
12829
Victor Stinner202fdca2012-05-07 12:47:02 +020012830 if (length > PY_SSIZE_T_MAX - writer->pos) {
12831 PyErr_NoMemory();
12832 return -1;
12833 }
12834 newlen = writer->pos + length;
12835
Victor Stinnerd3f08822012-05-29 12:57:52 +020012836 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012837 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012838 /* overallocate 25% to limit the number of resize */
12839 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12840 newlen += newlen / 4;
12841 if (newlen < writer->min_length)
12842 newlen = writer->min_length;
12843 }
12844 writer->buffer = PyUnicode_New(newlen, maxchar);
12845 if (writer->buffer == NULL)
12846 return -1;
12847 _PyUnicodeWriter_Update(writer);
12848 return 0;
12849 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012850
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012852 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012853 /* overallocate 25% to limit the number of resize */
12854 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12855 newlen += newlen / 4;
12856 if (newlen < writer->min_length)
12857 newlen = writer->min_length;
12858 }
12859
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012860 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012861 /* resize + widen */
12862 newbuffer = PyUnicode_New(newlen, maxchar);
12863 if (newbuffer == NULL)
12864 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012865 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12866 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012867 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012868 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012869 }
12870 else {
12871 newbuffer = resize_compact(writer->buffer, newlen);
12872 if (newbuffer == NULL)
12873 return -1;
12874 }
12875 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012876 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012877 }
12878 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012879 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012880 newbuffer = PyUnicode_New(writer->size, maxchar);
12881 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012882 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012883 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12884 writer->buffer, 0, writer->pos);
12885 Py_DECREF(writer->buffer);
12886 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012887 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012888 }
12889 return 0;
12890}
12891
Victor Stinnerd3f08822012-05-29 12:57:52 +020012892int
12893_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12894{
12895 Py_UCS4 maxchar;
12896 Py_ssize_t len;
12897
12898 if (PyUnicode_READY(str) == -1)
12899 return -1;
12900 len = PyUnicode_GET_LENGTH(str);
12901 if (len == 0)
12902 return 0;
12903 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12904 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012905 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012906 Py_INCREF(str);
12907 writer->buffer = str;
12908 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012909 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012910 writer->size = 0;
12911 writer->pos += len;
12912 return 0;
12913 }
12914 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12915 return -1;
12916 }
12917 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12918 str, 0, len);
12919 writer->pos += len;
12920 return 0;
12921}
12922
12923PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012924_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012925{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012926 if (writer->pos == 0) {
12927 Py_XDECREF(writer->buffer);
12928 Py_INCREF(unicode_empty);
12929 return unicode_empty;
12930 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012931 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012932 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12933 return writer->buffer;
12934 }
12935 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12936 PyObject *newbuffer;
12937 newbuffer = resize_compact(writer->buffer, writer->pos);
12938 if (newbuffer == NULL) {
12939 Py_DECREF(writer->buffer);
12940 return NULL;
12941 }
12942 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012943 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012944 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012945 return writer->buffer;
12946}
12947
Victor Stinnerd3f08822012-05-29 12:57:52 +020012948void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012949_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012950{
12951 Py_CLEAR(writer->buffer);
12952}
12953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012955
12956PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012958\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012959Return a formatted version of S, using substitutions from args and kwargs.\n\
12960The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012961
Eric Smith27bbca62010-11-04 17:06:58 +000012962PyDoc_STRVAR(format_map__doc__,
12963 "S.format_map(mapping) -> str\n\
12964\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012965Return a formatted version of S, using substitutions from mapping.\n\
12966The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012967
Eric Smith4a7d76d2008-05-30 18:10:19 +000012968static PyObject *
12969unicode__format__(PyObject* self, PyObject* args)
12970{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012971 PyObject *format_spec;
12972 _PyUnicodeWriter writer;
12973 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012974
12975 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12976 return NULL;
12977
Victor Stinnerd3f08822012-05-29 12:57:52 +020012978 if (PyUnicode_READY(self) == -1)
12979 return NULL;
12980 _PyUnicodeWriter_Init(&writer, 0);
12981 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12982 self, format_spec, 0,
12983 PyUnicode_GET_LENGTH(format_spec));
12984 if (ret == -1) {
12985 _PyUnicodeWriter_Dealloc(&writer);
12986 return NULL;
12987 }
12988 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012989}
12990
Eric Smith8c663262007-08-25 02:26:07 +000012991PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012993\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012994Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012995
12996static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012997unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 Py_ssize_t size;
13000
13001 /* If it's a compact object, account for base structure +
13002 character data. */
13003 if (PyUnicode_IS_COMPACT_ASCII(v))
13004 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13005 else if (PyUnicode_IS_COMPACT(v))
13006 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013007 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 else {
13009 /* If it is a two-block object, account for base object, and
13010 for character block if present. */
13011 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013012 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013014 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 }
13016 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013017 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013018 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013020 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013021 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022
13023 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013024}
13025
13026PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013028
13029static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013030unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013031{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013032 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 if (!copy)
13034 return NULL;
13035 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013036}
13037
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013039 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013040 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013041 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13042 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013043 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13044 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013045 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013046 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13047 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13048 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13049 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13050 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013051 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013052 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13053 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13054 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013055 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013056 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13057 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13058 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013059 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013061 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013062 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013063 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13064 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13065 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13066 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13067 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13068 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13069 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13070 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13071 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13072 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13073 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13074 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13075 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13076 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013077 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013078 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013079 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013080 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013081 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013082 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013083 {"maketrans", (PyCFunction) unicode_maketrans,
13084 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013085 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013086#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013087 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013088 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089#endif
13090
Benjamin Peterson14339b62009-01-31 16:36:08 +000013091 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 {NULL, NULL}
13093};
13094
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013095static PyObject *
13096unicode_mod(PyObject *v, PyObject *w)
13097{
Brian Curtindfc80e32011-08-10 20:28:54 -050013098 if (!PyUnicode_Check(v))
13099 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013101}
13102
13103static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013104 0, /*nb_add*/
13105 0, /*nb_subtract*/
13106 0, /*nb_multiply*/
13107 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013108};
13109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013111 (lenfunc) unicode_length, /* sq_length */
13112 PyUnicode_Concat, /* sq_concat */
13113 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13114 (ssizeargfunc) unicode_getitem, /* sq_item */
13115 0, /* sq_slice */
13116 0, /* sq_ass_item */
13117 0, /* sq_ass_slice */
13118 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119};
13120
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013121static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 if (PyUnicode_READY(self) == -1)
13125 return NULL;
13126
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013127 if (PyIndex_Check(item)) {
13128 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013129 if (i == -1 && PyErr_Occurred())
13130 return NULL;
13131 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013133 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013134 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013135 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013136 PyObject *result;
13137 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013138 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013139 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013143 return NULL;
13144 }
13145
13146 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013147 Py_INCREF(unicode_empty);
13148 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013150 slicelength == PyUnicode_GET_LENGTH(self)) {
13151 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013152 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013153 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013154 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013155 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013156 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 src_kind = PyUnicode_KIND(self);
13158 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013159 if (!PyUnicode_IS_ASCII(self)) {
13160 kind_limit = kind_maxchar_limit(src_kind);
13161 max_char = 0;
13162 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13163 ch = PyUnicode_READ(src_kind, src_data, cur);
13164 if (ch > max_char) {
13165 max_char = ch;
13166 if (max_char >= kind_limit)
13167 break;
13168 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013169 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013170 }
Victor Stinner55c99112011-10-13 01:17:06 +020013171 else
13172 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013173 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013174 if (result == NULL)
13175 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013176 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013177 dest_data = PyUnicode_DATA(result);
13178
13179 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013180 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13181 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013182 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013183 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013184 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013185 } else {
13186 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13187 return NULL;
13188 }
13189}
13190
13191static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013192 (lenfunc)unicode_length, /* mp_length */
13193 (binaryfunc)unicode_subscript, /* mp_subscript */
13194 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013195};
13196
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198/* Helpers for PyUnicode_Format() */
13199
13200static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013201getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013203 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 (*p_argidx)++;
13206 if (arglen < 0)
13207 return args;
13208 else
13209 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210 }
13211 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213 return NULL;
13214}
13215
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013216/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
Victor Stinnerd3f08822012-05-29 12:57:52 +020013218static int
13219formatfloat(PyObject *v, int flags, int prec, int type,
13220 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013222 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013224 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013225
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226 x = PyFloat_AsDouble(v);
13227 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013228 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013229
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013232
Eric Smith0923d1d2009-04-16 20:16:10 +000013233 p = PyOS_double_to_string(x, type, prec,
13234 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013235 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013236 return -1;
13237 len = strlen(p);
13238 if (writer) {
13239 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13240 return -1;
Victor Stinner184252a2012-06-16 02:57:41 +020013241 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013242 writer->pos += len;
13243 }
13244 else
13245 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013246 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013247 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248}
13249
Victor Stinnerd0880d52012-04-27 23:40:13 +020013250/* formatlong() emulates the format codes d, u, o, x and X, and
13251 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13252 * Python's regular ints.
13253 * Return value: a new PyUnicodeObject*, or NULL if error.
13254 * The output string is of the form
13255 * "-"? ("0x" | "0X")? digit+
13256 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13257 * set in flags. The case of hex digits will be correct,
13258 * There will be at least prec digits, zero-filled on the left if
13259 * necessary to get that many.
13260 * val object to be converted
13261 * flags bitmask of format flags; only F_ALT is looked at
13262 * prec minimum number of digits; 0-fill on left if needed
13263 * type a character in [duoxX]; u acts the same as d
13264 *
13265 * CAUTION: o, x and X conversions on regular ints can never
13266 * produce a '-' sign, but can for Python's unbounded ints.
13267 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013268static PyObject*
13269formatlong(PyObject *val, int flags, int prec, int type)
13270{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013271 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013272 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013273 Py_ssize_t i;
13274 int sign; /* 1 if '-', else 0 */
13275 int len; /* number of characters */
13276 Py_ssize_t llen;
13277 int numdigits; /* len == numnondigits + numdigits */
13278 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013279
Victor Stinnerd0880d52012-04-27 23:40:13 +020013280 /* Avoid exceeding SSIZE_T_MAX */
13281 if (prec > INT_MAX-3) {
13282 PyErr_SetString(PyExc_OverflowError,
13283 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013285 }
13286
13287 assert(PyLong_Check(val));
13288
13289 switch (type) {
13290 case 'd':
13291 case 'u':
13292 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013293 if (PyBool_Check(val))
13294 result = PyNumber_ToBase(val, 10);
13295 else
13296 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013297 break;
13298 case 'o':
13299 numnondigits = 2;
13300 result = PyNumber_ToBase(val, 8);
13301 break;
13302 case 'x':
13303 case 'X':
13304 numnondigits = 2;
13305 result = PyNumber_ToBase(val, 16);
13306 break;
13307 default:
13308 assert(!"'type' not in [duoxX]");
13309 }
13310 if (!result)
13311 return NULL;
13312
13313 assert(unicode_modifiable(result));
13314 assert(PyUnicode_IS_READY(result));
13315 assert(PyUnicode_IS_ASCII(result));
13316
13317 /* To modify the string in-place, there can only be one reference. */
13318 if (Py_REFCNT(result) != 1) {
13319 PyErr_BadInternalCall();
13320 return NULL;
13321 }
13322 buf = PyUnicode_DATA(result);
13323 llen = PyUnicode_GET_LENGTH(result);
13324 if (llen > INT_MAX) {
13325 PyErr_SetString(PyExc_ValueError,
13326 "string too large in _PyBytes_FormatLong");
13327 return NULL;
13328 }
13329 len = (int)llen;
13330 sign = buf[0] == '-';
13331 numnondigits += sign;
13332 numdigits = len - numnondigits;
13333 assert(numdigits > 0);
13334
13335 /* Get rid of base marker unless F_ALT */
13336 if (((flags & F_ALT) == 0 &&
13337 (type == 'o' || type == 'x' || type == 'X'))) {
13338 assert(buf[sign] == '0');
13339 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13340 buf[sign+1] == 'o');
13341 numnondigits -= 2;
13342 buf += 2;
13343 len -= 2;
13344 if (sign)
13345 buf[0] = '-';
13346 assert(len == numnondigits + numdigits);
13347 assert(numdigits > 0);
13348 }
13349
13350 /* Fill with leading zeroes to meet minimum width. */
13351 if (prec > numdigits) {
13352 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13353 numnondigits + prec);
13354 char *b1;
13355 if (!r1) {
13356 Py_DECREF(result);
13357 return NULL;
13358 }
13359 b1 = PyBytes_AS_STRING(r1);
13360 for (i = 0; i < numnondigits; ++i)
13361 *b1++ = *buf++;
13362 for (i = 0; i < prec - numdigits; i++)
13363 *b1++ = '0';
13364 for (i = 0; i < numdigits; i++)
13365 *b1++ = *buf++;
13366 *b1 = '\0';
13367 Py_DECREF(result);
13368 result = r1;
13369 buf = PyBytes_AS_STRING(result);
13370 len = numnondigits + prec;
13371 }
13372
13373 /* Fix up case for hex conversions. */
13374 if (type == 'X') {
13375 /* Need to convert all lower case letters to upper case.
13376 and need to convert 0x to 0X (and -0x to -0X). */
13377 for (i = 0; i < len; i++)
13378 if (buf[i] >= 'a' && buf[i] <= 'x')
13379 buf[i] -= 'a'-'A';
13380 }
13381 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13382 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013383 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013384 Py_DECREF(result);
13385 result = unicode;
13386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013387 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013388}
13389
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013390static Py_UCS4
13391formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013393 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013394 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 goto onError;
13399 }
13400 else {
13401 /* Integer input truncated to a character */
13402 long x;
13403 x = PyLong_AsLong(v);
13404 if (x == -1 && PyErr_Occurred())
13405 goto onError;
13406
Victor Stinner8faf8212011-12-08 22:14:11 +010013407 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 PyErr_SetString(PyExc_OverflowError,
13409 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 }
13412
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013414 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013415
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013417 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013419 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420}
13421
Alexander Belopolsky40018472011-02-26 01:02:56 +000013422PyObject *
13423PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013425 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013428 PyObject *temp = NULL;
13429 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013430 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013431 void *fmt;
13432 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013433 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013434 Py_ssize_t sublen;
13435 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013436
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 PyErr_BadInternalCall();
13439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013441 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013442 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013444 if (PyUnicode_READY(uformat) == -1)
13445 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 fmt = PyUnicode_DATA(uformat);
13448 fmtkind = PyUnicode_KIND(uformat);
13449 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13450 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451
Victor Stinnerd3f08822012-05-29 12:57:52 +020013452 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013453
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 arglen = PyTuple_Size(args);
13456 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457 }
13458 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 arglen = -1;
13460 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013461 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013462 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013463 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465
13466 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013468 Py_ssize_t nonfmtpos;
13469 nonfmtpos = fmtpos++;
13470 while (fmtcnt >= 0 &&
13471 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13472 fmtpos++;
13473 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013474 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013475 if (fmtcnt < 0)
13476 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013477 sublen = fmtpos - nonfmtpos;
13478 maxchar = _PyUnicode_FindMaxChar(uformat,
13479 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013480 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013481 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013482
Victor Stinnerd3f08822012-05-29 12:57:52 +020013483 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13484 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013485 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013486 }
13487 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013488 /* Got a format specifier */
13489 int flags = 0;
13490 Py_ssize_t width = -1;
13491 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013493 Py_UCS4 fill;
13494 int sign;
13495 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 int isnumok;
13497 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013498 void *pbuf = NULL;
13499 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013500 Py_UCS4 bufmaxchar;
13501 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013504 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13505 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013506 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 Py_ssize_t keylen;
13508 PyObject *key;
13509 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013510
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 if (dict == NULL) {
13512 PyErr_SetString(PyExc_TypeError,
13513 "format requires a mapping");
13514 goto onError;
13515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013516 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 /* Skip over balanced parentheses */
13520 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013521 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13522 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013524 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013525 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013526 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013528 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 if (fmtcnt < 0 || pcount > 0) {
13530 PyErr_SetString(PyExc_ValueError,
13531 "incomplete format key");
13532 goto onError;
13533 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013534 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013535 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 if (key == NULL)
13537 goto onError;
13538 if (args_owned) {
13539 Py_DECREF(args);
13540 args_owned = 0;
13541 }
13542 args = PyObject_GetItem(dict, key);
13543 Py_DECREF(key);
13544 if (args == NULL) {
13545 goto onError;
13546 }
13547 args_owned = 1;
13548 arglen = -1;
13549 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013550 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013552 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13553 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 case '-': flags |= F_LJUST; continue;
13555 case '+': flags |= F_SIGN; continue;
13556 case ' ': flags |= F_BLANK; continue;
13557 case '#': flags |= F_ALT; continue;
13558 case '0': flags |= F_ZERO; continue;
13559 }
13560 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 if (c == '*') {
13563 v = getnextarg(args, arglen, &argidx);
13564 if (v == NULL)
13565 goto onError;
13566 if (!PyLong_Check(v)) {
13567 PyErr_SetString(PyExc_TypeError,
13568 "* wants int");
13569 goto onError;
13570 }
13571 width = PyLong_AsLong(v);
13572 if (width == -1 && PyErr_Occurred())
13573 goto onError;
13574 if (width < 0) {
13575 flags |= F_LJUST;
13576 width = -width;
13577 }
13578 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013579 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 }
13581 else if (c >= '0' && c <= '9') {
13582 width = c - '0';
13583 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013584 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 if (c < '0' || c > '9')
13586 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013587 /* Since c is unsigned, the RHS would end up as unsigned,
13588 mixing signed and unsigned comparison. Since c is between
13589 '0' and '9', casting to int is safe. */
13590 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 PyErr_SetString(PyExc_ValueError,
13592 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013593 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 }
13595 width = width*10 + (c - '0');
13596 }
13597 }
13598 if (c == '.') {
13599 prec = 0;
13600 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 if (c == '*') {
13603 v = getnextarg(args, arglen, &argidx);
13604 if (v == NULL)
13605 goto onError;
13606 if (!PyLong_Check(v)) {
13607 PyErr_SetString(PyExc_TypeError,
13608 "* wants int");
13609 goto onError;
13610 }
13611 prec = PyLong_AsLong(v);
13612 if (prec == -1 && PyErr_Occurred())
13613 goto onError;
13614 if (prec < 0)
13615 prec = 0;
13616 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 }
13619 else if (c >= '0' && c <= '9') {
13620 prec = c - '0';
13621 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 if (c < '0' || c > '9')
13624 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013625 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 PyErr_SetString(PyExc_ValueError,
13627 "prec too big");
13628 goto onError;
13629 }
13630 prec = prec*10 + (c - '0');
13631 }
13632 }
13633 } /* prec */
13634 if (fmtcnt >= 0) {
13635 if (c == 'h' || c == 'l' || c == 'L') {
13636 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 }
13639 }
13640 if (fmtcnt < 0) {
13641 PyErr_SetString(PyExc_ValueError,
13642 "incomplete format");
13643 goto onError;
13644 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013645 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013646 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013647
13648 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013649 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013650 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013651 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13652 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013653 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013655
Victor Stinneraff3cc62012-04-30 05:19:21 +020013656 v = getnextarg(args, arglen, &argidx);
13657 if (v == NULL)
13658 goto onError;
13659
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013661 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 fill = ' ';
13663 switch (c) {
13664
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 case 's':
13666 case 'r':
13667 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013668 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13669 /* Fast path */
13670 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13671 goto onError;
13672 goto nextarg;
13673 }
13674
Victor Stinner808fc0a2010-03-22 12:50:40 +000013675 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 temp = v;
13677 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013678 }
13679 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 if (c == 's')
13681 temp = PyObject_Str(v);
13682 else if (c == 'r')
13683 temp = PyObject_Repr(v);
13684 else
13685 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013686 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 break;
13688
13689 case 'i':
13690 case 'd':
13691 case 'u':
13692 case 'o':
13693 case 'x':
13694 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013695 if (PyLong_CheckExact(v)
13696 && width == -1 && prec == -1
13697 && !(flags & (F_SIGN | F_BLANK)))
13698 {
13699 /* Fast path */
13700 switch(c)
13701 {
13702 case 'd':
13703 case 'i':
13704 case 'u':
13705 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13706 goto onError;
13707 goto nextarg;
13708 case 'x':
13709 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13710 goto onError;
13711 goto nextarg;
13712 case 'o':
13713 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13714 goto onError;
13715 goto nextarg;
13716 default:
13717 break;
13718 }
13719 }
13720
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 isnumok = 0;
13722 if (PyNumber_Check(v)) {
13723 PyObject *iobj=NULL;
13724
13725 if (PyLong_Check(v)) {
13726 iobj = v;
13727 Py_INCREF(iobj);
13728 }
13729 else {
13730 iobj = PyNumber_Long(v);
13731 }
13732 if (iobj!=NULL) {
13733 if (PyLong_Check(iobj)) {
13734 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013735 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013736 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 }
13739 else {
13740 Py_DECREF(iobj);
13741 }
13742 }
13743 }
13744 if (!isnumok) {
13745 PyErr_Format(PyExc_TypeError,
13746 "%%%c format: a number is required, "
13747 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13748 goto onError;
13749 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013750 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 fill = '0';
13752 break;
13753
13754 case 'e':
13755 case 'E':
13756 case 'f':
13757 case 'F':
13758 case 'g':
13759 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013760 if (width == -1 && prec == -1
13761 && !(flags & (F_SIGN | F_BLANK)))
13762 {
13763 /* Fast path */
13764 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13765 goto onError;
13766 goto nextarg;
13767 }
13768
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013770 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013772 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13773 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 break;
13775
13776 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013777 {
13778 Py_UCS4 ch = formatchar(v);
13779 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013780 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013781 if (width == -1 && prec == -1) {
13782 /* Fast path */
13783 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13784 goto onError;
13785 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13786 writer.pos += 1;
13787 goto nextarg;
13788 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013789 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013791 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013792
13793 default:
13794 PyErr_Format(PyExc_ValueError,
13795 "unsupported format character '%c' (0x%x) "
13796 "at index %zd",
13797 (31<=c && c<=126) ? (char)c : '?',
13798 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 goto onError;
13801 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013802 if (temp == NULL)
13803 goto onError;
13804 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013805
13806 if (width == -1 && prec == -1
13807 && !(flags & (F_SIGN | F_BLANK)))
13808 {
13809 /* Fast path */
13810 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13811 goto onError;
13812 goto nextarg;
13813 }
13814
Victor Stinneraff3cc62012-04-30 05:19:21 +020013815 if (PyUnicode_READY(temp) == -1) {
13816 Py_CLEAR(temp);
13817 goto onError;
13818 }
13819 kind = PyUnicode_KIND(temp);
13820 pbuf = PyUnicode_DATA(temp);
13821 len = PyUnicode_GET_LENGTH(temp);
13822
13823 if (c == 's' || c == 'r' || c == 'a') {
13824 if (prec >= 0 && len > prec)
13825 len = prec;
13826 }
13827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 /* pbuf is initialized here. */
13829 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013831 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13832 if (ch == '-' || ch == '+') {
13833 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013834 len--;
13835 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013836 }
13837 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013838 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013840 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013841 else
13842 sign = 0;
13843 }
13844 if (width < len)
13845 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013846
13847 /* Compute the length and maximum character of the
13848 written characters */
13849 bufmaxchar = 127;
13850 if (!(flags & F_LJUST)) {
13851 if (sign) {
13852 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013853 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013854 }
13855 else {
13856 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013858 }
13859 }
13860 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013862
13863 buflen = width;
13864 if (sign && len == width)
13865 buflen++;
13866
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013867 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013868 goto onError;
13869
13870 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013872 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013873 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13874 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013875 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 if (width > len)
13877 width--;
13878 }
13879 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013880 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013881 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013883 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13884 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13885 writer.pos += 2;
13886 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 width -= 2;
13889 if (width < 0)
13890 width = 0;
13891 len -= 2;
13892 }
13893 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013894 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013895 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13896 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013897 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 }
13899 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013900 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013901 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13902 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013903 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013904 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13906 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013907 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13908 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13909 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013910 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013911 }
13912 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013913
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013914 if (len) {
13915 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13916 temp, pindex, len);
13917 writer.pos += len;
13918 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013919 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013920 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013921 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13922 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013923 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013924
Victor Stinnerd3f08822012-05-29 12:57:52 +020013925nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013926 if (dict && (argidx < arglen) && c != '%') {
13927 PyErr_SetString(PyExc_TypeError,
13928 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 goto onError;
13930 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013931 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 } /* until end */
13934 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 PyErr_SetString(PyExc_TypeError,
13936 "not all arguments converted during string formatting");
13937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938 }
13939
13940 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013941 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942 }
13943 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013944 Py_XDECREF(temp);
13945 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013946 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 Py_XDECREF(temp);
13951 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013952 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013954 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013955 }
13956 return NULL;
13957}
13958
Jeremy Hylton938ace62002-07-17 16:30:39 +000013959static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013960unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13961
Tim Peters6d6c1a32001-08-02 04:15:00 +000013962static PyObject *
13963unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13964{
Benjamin Peterson29060642009-01-31 22:14:21 +000013965 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 static char *kwlist[] = {"object", "encoding", "errors", 0};
13967 char *encoding = NULL;
13968 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013969
Benjamin Peterson14339b62009-01-31 16:36:08 +000013970 if (type != &PyUnicode_Type)
13971 return unicode_subtype_new(type, args, kwds);
13972 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013973 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013974 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013975 if (x == NULL) {
13976 Py_INCREF(unicode_empty);
13977 return unicode_empty;
13978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 if (encoding == NULL && errors == NULL)
13980 return PyObject_Str(x);
13981 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013983}
13984
Guido van Rossume023fe02001-08-30 03:12:59 +000013985static PyObject *
13986unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13987{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013988 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013989 Py_ssize_t length, char_size;
13990 int share_wstr, share_utf8;
13991 unsigned int kind;
13992 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013993
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013995
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013996 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013997 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013998 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013999 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014000 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014001 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014002 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014003 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014004
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014005 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014006 if (self == NULL) {
14007 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 return NULL;
14009 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014010 kind = PyUnicode_KIND(unicode);
14011 length = PyUnicode_GET_LENGTH(unicode);
14012
14013 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014014#ifdef Py_DEBUG
14015 _PyUnicode_HASH(self) = -1;
14016#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014017 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014018#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014019 _PyUnicode_STATE(self).interned = 0;
14020 _PyUnicode_STATE(self).kind = kind;
14021 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014022 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014023 _PyUnicode_STATE(self).ready = 1;
14024 _PyUnicode_WSTR(self) = NULL;
14025 _PyUnicode_UTF8_LENGTH(self) = 0;
14026 _PyUnicode_UTF8(self) = NULL;
14027 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014028 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014029
14030 share_utf8 = 0;
14031 share_wstr = 0;
14032 if (kind == PyUnicode_1BYTE_KIND) {
14033 char_size = 1;
14034 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14035 share_utf8 = 1;
14036 }
14037 else if (kind == PyUnicode_2BYTE_KIND) {
14038 char_size = 2;
14039 if (sizeof(wchar_t) == 2)
14040 share_wstr = 1;
14041 }
14042 else {
14043 assert(kind == PyUnicode_4BYTE_KIND);
14044 char_size = 4;
14045 if (sizeof(wchar_t) == 4)
14046 share_wstr = 1;
14047 }
14048
14049 /* Ensure we won't overflow the length. */
14050 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14051 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014052 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014054 data = PyObject_MALLOC((length + 1) * char_size);
14055 if (data == NULL) {
14056 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014057 goto onError;
14058 }
14059
Victor Stinnerc3c74152011-10-02 20:39:55 +020014060 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014061 if (share_utf8) {
14062 _PyUnicode_UTF8_LENGTH(self) = length;
14063 _PyUnicode_UTF8(self) = data;
14064 }
14065 if (share_wstr) {
14066 _PyUnicode_WSTR_LENGTH(self) = length;
14067 _PyUnicode_WSTR(self) = (wchar_t *)data;
14068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014069
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014071 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014072 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014073#ifdef Py_DEBUG
14074 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14075#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014076 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014077 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014078
14079onError:
14080 Py_DECREF(unicode);
14081 Py_DECREF(self);
14082 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014083}
14084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014085PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014086 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014087\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014088Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014089encoding defaults to the current default string encoding.\n\
14090errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014091
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014092static PyObject *unicode_iter(PyObject *seq);
14093
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014095 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014096 "str", /* tp_name */
14097 sizeof(PyUnicodeObject), /* tp_size */
14098 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014100 (destructor)unicode_dealloc, /* tp_dealloc */
14101 0, /* tp_print */
14102 0, /* tp_getattr */
14103 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014104 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014105 unicode_repr, /* tp_repr */
14106 &unicode_as_number, /* tp_as_number */
14107 &unicode_as_sequence, /* tp_as_sequence */
14108 &unicode_as_mapping, /* tp_as_mapping */
14109 (hashfunc) unicode_hash, /* tp_hash*/
14110 0, /* tp_call*/
14111 (reprfunc) unicode_str, /* tp_str */
14112 PyObject_GenericGetAttr, /* tp_getattro */
14113 0, /* tp_setattro */
14114 0, /* tp_as_buffer */
14115 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 unicode_doc, /* tp_doc */
14118 0, /* tp_traverse */
14119 0, /* tp_clear */
14120 PyUnicode_RichCompare, /* tp_richcompare */
14121 0, /* tp_weaklistoffset */
14122 unicode_iter, /* tp_iter */
14123 0, /* tp_iternext */
14124 unicode_methods, /* tp_methods */
14125 0, /* tp_members */
14126 0, /* tp_getset */
14127 &PyBaseObject_Type, /* tp_base */
14128 0, /* tp_dict */
14129 0, /* tp_descr_get */
14130 0, /* tp_descr_set */
14131 0, /* tp_dictoffset */
14132 0, /* tp_init */
14133 0, /* tp_alloc */
14134 unicode_new, /* tp_new */
14135 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136};
14137
14138/* Initialize the Unicode implementation */
14139
Victor Stinner3a50e702011-10-18 21:21:00 +020014140int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014141{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014142 int i;
14143
Thomas Wouters477c8d52006-05-27 19:21:47 +000014144 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014145 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014146 0x000A, /* LINE FEED */
14147 0x000D, /* CARRIAGE RETURN */
14148 0x001C, /* FILE SEPARATOR */
14149 0x001D, /* GROUP SEPARATOR */
14150 0x001E, /* RECORD SEPARATOR */
14151 0x0085, /* NEXT LINE */
14152 0x2028, /* LINE SEPARATOR */
14153 0x2029, /* PARAGRAPH SEPARATOR */
14154 };
14155
Fred Drakee4315f52000-05-09 19:53:39 +000014156 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014157 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014158 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014159 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014160 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014161
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014162 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014163 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014164 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014165 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014166
14167 /* initialize the linebreak bloom filter */
14168 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014169 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014170 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014171
14172 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014173
14174#ifdef HAVE_MBCS
14175 winver.dwOSVersionInfoSize = sizeof(winver);
14176 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14177 PyErr_SetFromWindowsErr(0);
14178 return -1;
14179 }
14180#endif
14181 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014182}
14183
14184/* Finalize the Unicode implementation */
14185
/* Always reports 0; this implementation performs no work. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14191
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192void
Thomas Wouters78890102000-07-22 19:25:51 +000014193_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014195 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014196
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014197 Py_XDECREF(unicode_empty);
14198 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014199
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014200 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014201 if (unicode_latin1[i]) {
14202 Py_DECREF(unicode_latin1[i]);
14203 unicode_latin1[i] = NULL;
14204 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014205 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014206 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014207 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014208}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014209
Walter Dörwald16807132007-05-25 13:52:07 +000014210void
14211PyUnicode_InternInPlace(PyObject **p)
14212{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014213 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014214 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014215#ifdef Py_DEBUG
14216 assert(s != NULL);
14217 assert(_PyUnicode_CHECK(s));
14218#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014219 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014220 return;
14221#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014222 /* If it's a subclass, we don't really know what putting
14223 it in the interned dict might do. */
14224 if (!PyUnicode_CheckExact(s))
14225 return;
14226 if (PyUnicode_CHECK_INTERNED(s))
14227 return;
14228 if (interned == NULL) {
14229 interned = PyDict_New();
14230 if (interned == NULL) {
14231 PyErr_Clear(); /* Don't leave an exception */
14232 return;
14233 }
14234 }
14235 /* It might be that the GetItem call fails even
14236 though the key is present in the dictionary,
14237 namely when this happens during a stack overflow. */
14238 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014239 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014240 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014241
Benjamin Peterson29060642009-01-31 22:14:21 +000014242 if (t) {
14243 Py_INCREF(t);
14244 Py_DECREF(*p);
14245 *p = t;
14246 return;
14247 }
Walter Dörwald16807132007-05-25 13:52:07 +000014248
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014250 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014251 PyErr_Clear();
14252 PyThreadState_GET()->recursion_critical = 0;
14253 return;
14254 }
14255 PyThreadState_GET()->recursion_critical = 0;
14256 /* The two references in interned are not counted by refcnt.
14257 The deallocator will take care of this */
14258 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014259 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014260}
14261
14262void
14263PyUnicode_InternImmortal(PyObject **p)
14264{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014265 PyUnicode_InternInPlace(p);
14266 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014267 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 Py_INCREF(*p);
14269 }
Walter Dörwald16807132007-05-25 13:52:07 +000014270}
14271
14272PyObject *
14273PyUnicode_InternFromString(const char *cp)
14274{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 PyObject *s = PyUnicode_FromString(cp);
14276 if (s == NULL)
14277 return NULL;
14278 PyUnicode_InternInPlace(&s);
14279 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014280}
14281
Alexander Belopolsky40018472011-02-26 01:02:56 +000014282void
14283_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014284{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014285 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014286 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 Py_ssize_t i, n;
14288 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014289
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 if (interned == NULL || !PyDict_Check(interned))
14291 return;
14292 keys = PyDict_Keys(interned);
14293 if (keys == NULL || !PyList_Check(keys)) {
14294 PyErr_Clear();
14295 return;
14296 }
Walter Dörwald16807132007-05-25 13:52:07 +000014297
Benjamin Peterson14339b62009-01-31 16:36:08 +000014298 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14299 detector, interned unicode strings are not forcibly deallocated;
14300 rather, we give them their stolen references back, and then clear
14301 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014302
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 n = PyList_GET_SIZE(keys);
14304 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014305 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014307 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014308 if (PyUnicode_READY(s) == -1) {
14309 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014310 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014312 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014313 case SSTATE_NOT_INTERNED:
14314 /* XXX Shouldn't happen */
14315 break;
14316 case SSTATE_INTERNED_IMMORTAL:
14317 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014318 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 break;
14320 case SSTATE_INTERNED_MORTAL:
14321 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014322 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014323 break;
14324 default:
14325 Py_FatalError("Inconsistent interned string state.");
14326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014327 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014328 }
14329 fprintf(stderr, "total size of all interned strings: "
14330 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14331 "mortal/immortal\n", mortal_size, immortal_size);
14332 Py_DECREF(keys);
14333 PyDict_Clear(interned);
14334 Py_DECREF(interned);
14335 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014336}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014337
14338
14339/********************* Unicode Iterator **************************/
14340
14341typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014342 PyObject_HEAD
14343 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014344 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014345} unicodeiterobject;
14346
14347static void
14348unicodeiter_dealloc(unicodeiterobject *it)
14349{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014350 _PyObject_GC_UNTRACK(it);
14351 Py_XDECREF(it->it_seq);
14352 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014353}
14354
14355static int
14356unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14357{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014358 Py_VISIT(it->it_seq);
14359 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014360}
14361
14362static PyObject *
14363unicodeiter_next(unicodeiterobject *it)
14364{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014365 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014366
Benjamin Peterson14339b62009-01-31 16:36:08 +000014367 assert(it != NULL);
14368 seq = it->it_seq;
14369 if (seq == NULL)
14370 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014371 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014373 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14374 int kind = PyUnicode_KIND(seq);
14375 void *data = PyUnicode_DATA(seq);
14376 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14377 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014378 if (item != NULL)
14379 ++it->it_index;
14380 return item;
14381 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014382
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 Py_DECREF(seq);
14384 it->it_seq = NULL;
14385 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014386}
14387
14388static PyObject *
14389unicodeiter_len(unicodeiterobject *it)
14390{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014391 Py_ssize_t len = 0;
14392 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014393 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014394 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014395}
14396
14397PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14398
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014399static PyObject *
14400unicodeiter_reduce(unicodeiterobject *it)
14401{
14402 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014403 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014404 it->it_seq, it->it_index);
14405 } else {
14406 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14407 if (u == NULL)
14408 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014409 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014410 }
14411}
14412
14413PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14414
14415static PyObject *
14416unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14417{
14418 Py_ssize_t index = PyLong_AsSsize_t(state);
14419 if (index == -1 && PyErr_Occurred())
14420 return NULL;
14421 if (index < 0)
14422 index = 0;
14423 it->it_index = index;
14424 Py_RETURN_NONE;
14425}
14426
14427PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14428
/* Methods exposed on str_iterator objects: the length hint plus the
   __reduce__/__setstate__ pair used for pickling. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
14438
/* Type object for the iterator created by unicode_iter().  Instances are
   GC-enabled (Py_TPFLAGS_HAVE_GC), return themselves from tp_iter and
   produce items through unicodeiter_next(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                      /* tp_print */
    0,                      /* tp_getattr */
    0,                      /* tp_setattr */
    0,                      /* tp_reserved */
    0,                      /* tp_repr */
    0,                      /* tp_as_number */
    0,                      /* tp_as_sequence */
    0,                      /* tp_as_mapping */
    0,                      /* tp_hash */
    0,                      /* tp_call */
    0,                      /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                      /* tp_setattro */
    0,                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                      /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                      /* tp_clear */
    0,                      /* tp_richcompare */
    0,                      /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                      /* tp_members */
};
14471
14472static PyObject *
14473unicode_iter(PyObject *seq)
14474{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014476
Benjamin Peterson14339b62009-01-31 16:36:08 +000014477 if (!PyUnicode_Check(seq)) {
14478 PyErr_BadInternalCall();
14479 return NULL;
14480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014481 if (PyUnicode_READY(seq) == -1)
14482 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14484 if (it == NULL)
14485 return NULL;
14486 it->it_index = 0;
14487 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014488 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 _PyObject_GC_TRACK(it);
14490 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014491}
14492
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014493
/* Return the number of Py_UNICODE elements in the null-terminated
   string u, not counting the terminator.

   The length is accumulated as a pointer difference converted to
   size_t, so it cannot overflow the way an int counter would on
   strings longer than INT_MAX elements. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end != 0)
        end++;
    return (size_t)(end - u);
}
14502
/* Copy the null-terminated string s2, terminator included, into s1.
   The destination must be large enough; no bounds are checked.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        const Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14510
/* Copy at most n elements of the null-terminated string s2 into s1.

   Copying stops right after the terminator has been copied, or once n
   elements have been written, whichever comes first.  As with C's
   strncpy(), the result is NOT null-terminated when s2 holds n or more
   characters; unlike strncpy(), the remainder of s1 is not padded.
   Nothing is written when n is 0.  Returns s1.

   The count is tested before each store so that never more than n
   elements of s1 are touched. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14520
/* Append the null-terminated string s2 to the end of s1, overwriting
   s1's terminator.  The destination must have room for the result;
   no bounds are checked.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
14529
/* Compare two null-terminated Py_UNICODE strings element by element.
   Returns -1, 0 or +1 when s1 sorts before, equal to or after s2;
   a string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (; *s1 != 0 && *s2 != 0; s1++, s2++) {
        if (*s1 != *s2)
            return (*s1 < *s2) ? -1 : +1;
    }
    /* At least one string has ended. */
    if (*s1 != 0)
        return 1;
    if (*s2 != 0)
        return -1;
    return 0;
}
14543
/* Compare at most n elements of two null-terminated Py_UNICODE strings.
   Returns -1 or +1 at the first differing element, and 0 when the
   strings agree over the first n elements or both end earlier. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE c1 = s1[i];
        const Py_UNICODE c2 = s2[i];

        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
        if (c1 == 0)
            break;          /* both strings ended together */
    }
    return 0;
}
14560
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14570
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The search starts just before
   the terminator, so the terminator itself is never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i = Py_UNICODE_strlen(s);

    while (i > 0) {
        i--;
        if (s[i] == c)
            return (Py_UNICODE*)(s + i);
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014583
Victor Stinner71133ff2010-09-01 23:43:53 +000014584Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014585PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014586{
Victor Stinner577db2c2011-10-11 22:12:48 +020014587 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014588 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014590 if (!PyUnicode_Check(unicode)) {
14591 PyErr_BadArgument();
14592 return NULL;
14593 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014594 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014595 if (u == NULL)
14596 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014597 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014598 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014599 PyErr_NoMemory();
14600 return NULL;
14601 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014602 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014603 size *= sizeof(Py_UNICODE);
14604 copy = PyMem_Malloc(size);
14605 if (copy == NULL) {
14606 PyErr_NoMemory();
14607 return NULL;
14608 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014609 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014610 return copy;
14611}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014612
Georg Brandl66c221e2010-10-14 07:04:07 +000014613/* A _string module, to export formatter_parser and formatter_field_name_split
14614 to the string.Formatter class implemented in Python. */
14615
/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14623
/* Module definition for _string; passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,                                      /* per-module state size */
    _string_methods,                        /* module-level functions */
    NULL,                                   /* remaining optional slots */
    NULL,                                   /* are all unused */
    NULL,
    NULL
};
14635
14636PyMODINIT_FUNC
14637PyInit__string(void)
14638{
14639 return PyModule_Create(&_string_module);
14640}
14641
14642
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014643#ifdef __cplusplus
14644}
14645#endif