blob: 0722312373d6394a35313bebe6f9b99c62937b0f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
Victor Stinnere90fe6a2011-10-01 16:48:13 +020079#define _PyUnicode_UTF8(op) \
80 (((PyCompactUnicodeObject*)(op))->utf8)
81#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020082 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020083 assert(PyUnicode_IS_READY(op)), \
84 PyUnicode_IS_COMPACT_ASCII(op) ? \
85 ((char*)((PyASCIIObject*)(op) + 1)) : \
86 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020087#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020088 (((PyCompactUnicodeObject*)(op))->utf8_length)
89#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020090 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020091 assert(PyUnicode_IS_READY(op)), \
92 PyUnicode_IS_COMPACT_ASCII(op) ? \
93 ((PyASCIIObject*)(op))->length : \
94 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020095#define _PyUnicode_WSTR(op) \
96 (((PyASCIIObject*)(op))->wstr)
97#define _PyUnicode_WSTR_LENGTH(op) \
98 (((PyCompactUnicodeObject*)(op))->wstr_length)
99#define _PyUnicode_LENGTH(op) \
100 (((PyASCIIObject *)(op))->length)
101#define _PyUnicode_STATE(op) \
102 (((PyASCIIObject *)(op))->state)
103#define _PyUnicode_HASH(op) \
104 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200105#define _PyUnicode_KIND(op) \
106 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200107 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200108#define _PyUnicode_GET_LENGTH(op) \
109 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200110 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200111#define _PyUnicode_DATA_ANY(op) \
112 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New() */
116#define MAX_MAXCHAR(maxchar1, maxchar2) \
117 ((maxchar1) | (maxchar2))
118
Victor Stinner910337b2011-10-03 03:20:16 +0200119#undef PyUnicode_READY
120#define PyUnicode_READY(op) \
121 (assert(_PyUnicode_CHECK(op)), \
122 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200123 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100124 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
Victor Stinnerc379ead2011-10-03 12:52:27 +0200126#define _PyUnicode_SHARE_UTF8(op) \
127 (assert(_PyUnicode_CHECK(op)), \
128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is equal to its PyUnicode_DATA() pointer.
   Every access goes through the macro argument `op`, so the macro does not
   depend on the name of the caller's variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
Victor Stinner829c0ad2011-10-03 01:08:02 +0200134/* true if the Unicode object has an allocated UTF-8 memory block
135 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200136#define _PyUnicode_HAS_UTF8_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \
138 (!PyUnicode_IS_COMPACT_ASCII(op) \
139 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
141
Victor Stinner03490912011-10-03 23:45:12 +0200142/* true if the Unicode object has an allocated wstr memory block
143 (not shared with other data) */
144#define _PyUnicode_HAS_WSTR_MEMORY(op) \
145 (assert(_PyUnicode_CHECK(op)), \
146 (_PyUnicode_WSTR(op) && \
147 (!PyUnicode_IS_READY(op) || \
148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
Victor Stinner910337b2011-10-03 03:20:16 +0200150/* Generic helper macro to convert characters of different types.
151 from_type and to_type have to be valid type names, begin and end
152 are pointers to the source characters which should be of type
153 "from_type *". to is a pointer of type "to_type *" and points to the
154 buffer where the result characters are written to. */
/* Copy the characters in [begin, end) to `to`, converting each one with a
   plain cast from from_type to to_type.  The main loop handles four
   characters per iteration; the trailing loop copies the remaining 0-3.
   `to` is parenthesized before being cast so that an expression argument
   (e.g. "buf + offset") is evaluated with its own pointer type first, and
   all temporaries carry a leading underscore so they cannot shadow a
   caller's variable named `n`. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
Christian Heimes190d79e2008-01-30 11:58:22 +0000194/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by a code point in the range 0..127; an entry is 1
   for the characters listed in the row comments and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
Christian Heimes190d79e2008-01-30 11:58:22 +0000257/* Same for linebreaks */
/* Lookup table indexed by a code point in the range 0..127; an entry is 1
   for the line break characters listed in the row comments and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
#ifdef Py_DEBUG
/* Debug-build validation of the internal state of a unicode object.

   Asserts that the state flags (kind, compact, ascii, ready), the data
   pointer and the cached utf8/wstr pointers and lengths of `op` agree with
   each other.  When `check_content` is non-zero and the kind is not
   PyUnicode_WCHAR_KIND, the characters are also scanned to assert that the
   largest one fits the range expected for the kind and that the character
   at index `length` is 0.

   Always returns 1, so the call can itself be placed inside assert();
   every violation is reported through a failed assertion. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data starts right after the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            /* non-compact: data lives behind the data.any pointer */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* the kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing cache must have a zero length */
        assert(compact->utf8 != NULL || compact->utf8_length == 0);
        assert(ascii->wstr != NULL || compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
Victor Stinner3a50e702011-10-18 21:21:00 +0200513#ifdef HAVE_MBCS
514static OSVERSIONINFOEX winver;
515#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
519/* stuff to implement simple "bloom filters" for Unicode characters.
520 to keep things simple, we use a single bitmask, using the least 5
521 bits from each unicode characters as the bit index. */
522
523/* the linebreak mask is set up by Unicode_Init below */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535#define BLOOM_MASK unsigned long
536
537static BLOOM_MASK bloom_linebreak;
538
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch modulo BLOOM_WIDTH).  `mask` is parenthesized so that an
   expression argument such as "a | b" is combined as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542#define BLOOM_LINEBREAK(ch) \
543 ((ch) < 128U ? ascii_linebreak[(ch)] : \
544 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561#define BLOOM_MEMBER(mask, chr, str) \
562 (BLOOM(mask, chr) \
563 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200657 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100659 assert(PyUnicode_IS_COMPACT(unicode));
660
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200661 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100662 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 struct_size = sizeof(PyASCIIObject);
664 else
665 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200666 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
669 PyErr_NoMemory();
670 return NULL;
671 }
672 new_size = (struct_size + (length + 1) * char_size);
673
Victor Stinner84def372011-12-11 20:04:56 +0100674 _Py_DEC_REFTOTAL;
675 _Py_ForgetReference(unicode);
676
677 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
678 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100679 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 PyErr_NoMemory();
681 return NULL;
682 }
Victor Stinner84def372011-12-11 20:04:56 +0100683 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100685
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200687 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100689 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200690 _PyUnicode_WSTR_LENGTH(unicode) = length;
691 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
693 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200694 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return unicode;
696}
697
Alexander Belopolsky40018472011-02-26 01:02:56 +0000698static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200699resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700{
Victor Stinner95663112011-10-04 01:03:50 +0200701 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100702 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000705
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 if (PyUnicode_IS_READY(unicode)) {
707 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200708 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 void *data;
710
711 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200713 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
714 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
716 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
717 PyErr_NoMemory();
718 return -1;
719 }
720 new_size = (length + 1) * char_size;
721
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
723 {
724 PyObject_DEL(_PyUnicode_UTF8(unicode));
725 _PyUnicode_UTF8(unicode) = NULL;
726 _PyUnicode_UTF8_LENGTH(unicode) = 0;
727 }
728
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 data = (PyObject *)PyObject_REALLOC(data, new_size);
730 if (data == NULL) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200737 _PyUnicode_WSTR_LENGTH(unicode) = length;
738 }
739 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200740 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_UTF8_LENGTH(unicode) = length;
742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_LENGTH(unicode) = length;
744 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200745 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200746 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749 }
Victor Stinner95663112011-10-04 01:03:50 +0200750 assert(_PyUnicode_WSTR(unicode) != NULL);
751
752 /* check for integer overflow */
753 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
754 PyErr_NoMemory();
755 return -1;
756 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200758 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100759 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200760 if (!wstr) {
761 PyErr_NoMemory();
762 return -1;
763 }
764 _PyUnicode_WSTR(unicode) = wstr;
765 _PyUnicode_WSTR(unicode)[length] = 0;
766 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200767 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 return 0;
769}
770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771static PyObject*
772resize_copy(PyObject *unicode, Py_ssize_t length)
773{
774 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777
Benjamin Petersonbac79492012-01-14 13:34:47 -0500778 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
782 if (copy == NULL)
783 return NULL;
784
785 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200786 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200788 }
789 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 if (w == NULL)
794 return NULL;
795 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
796 copy_length = Py_MIN(copy_length, length);
797 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
798 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200799 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200800 }
801}
802
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
811
#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New() each time it goes
   on to allocate a new object. */
static int unicode_old_new_calls = 0;
#endif
815
Alexander Belopolsky40018472011-02-26 01:02:56 +0000816static PyUnicodeObject *
817_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818{
819 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 if (length == 0 && unicode_empty != NULL) {
824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200825 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000828 /* Ensure we won't overflow the size. */
829 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
830 return (PyUnicodeObject *)PyErr_NoMemory();
831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832 if (length < 0) {
833 PyErr_SetString(PyExc_SystemError,
834 "Negative size passed to _PyUnicode_New");
835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 }
837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838#ifdef Py_DEBUG
839 ++unicode_old_new_calls;
840#endif
841
842 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
843 if (unicode == NULL)
844 return NULL;
845 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
846 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
847 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000849 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100850 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852
Jeremy Hyltond8082792003-09-16 19:41:39 +0000853 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000854 * the caller fails before initializing str -- unicode_resize()
855 * reads str[0], and the Keep-Alive optimization can keep memory
856 * allocated for str alive across a call to unicode_dealloc(unicode).
857 * We don't want unicode_resize to read uninitialized memory in
858 * that case.
859 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 _PyUnicode_WSTR(unicode)[0] = 0;
861 _PyUnicode_WSTR(unicode)[length] = 0;
862 _PyUnicode_WSTR_LENGTH(unicode) = length;
863 _PyUnicode_HASH(unicode) = -1;
864 _PyUnicode_STATE(unicode).interned = 0;
865 _PyUnicode_STATE(unicode).kind = 0;
866 _PyUnicode_STATE(unicode).compact = 0;
867 _PyUnicode_STATE(unicode).ready = 0;
868 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200869 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200871 _PyUnicode_UTF8(unicode) = NULL;
872 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100873 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874 return unicode;
875}
876
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877static const char*
878unicode_kind_name(PyObject *unicode)
879{
Victor Stinner42dfd712011-10-03 14:41:45 +0200880 /* don't check consistency: unicode_kind_name() is called from
881 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 if (!PyUnicode_IS_COMPACT(unicode))
883 {
884 if (!PyUnicode_IS_READY(unicode))
885 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600886 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 {
888 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 return "legacy ascii";
891 else
892 return "legacy latin1";
893 case PyUnicode_2BYTE_KIND:
894 return "legacy UCS2";
895 case PyUnicode_4BYTE_KIND:
896 return "legacy UCS4";
897 default:
898 return "<legacy invalid kind>";
899 }
900 }
901 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600902 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 return "ascii";
906 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 default:
913 return "<invalid compact kind>";
914 }
915}
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() each time it goes
   on to allocate a new object. */
static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919
920/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
924
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
928void *_PyUnicode_data(void *unicode){
929 printf("obj %p\n", unicode);
930 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
931 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
932 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
933 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
934 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
935 return PyUnicode_DATA(unicode);
936}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200937
938void
939_PyUnicode_Dump(PyObject *op)
940{
941 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
943 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
944 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200947 {
948 if (ascii->state.ascii)
949 data = (ascii + 1);
950 else
951 data = (compact + 1);
952 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 else
954 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
956
Victor Stinnera849a4b2011-10-03 12:12:11 +0200957 if (ascii->wstr == data)
958 printf("shared ");
959 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200960
Victor Stinnera3b334d2011-10-03 13:53:37 +0200961 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 printf(" (%zu), ", compact->wstr_length);
963 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
964 printf("shared ");
965 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200966 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200967 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200968}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969#endif
970
971PyObject *
972PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
973{
974 PyObject *obj;
975 PyCompactUnicodeObject *unicode;
976 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200977 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200978 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 Py_ssize_t char_size;
980 Py_ssize_t struct_size;
981
982 /* Optimization for empty strings */
983 if (size == 0 && unicode_empty != NULL) {
984 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200985 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 }
987
988#ifdef Py_DEBUG
989 ++unicode_new_new_calls;
990#endif
991
Victor Stinner9e9d6892011-10-04 01:02:02 +0200992 is_ascii = 0;
993 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 struct_size = sizeof(PyCompactUnicodeObject);
995 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200996 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 char_size = 1;
998 is_ascii = 1;
999 struct_size = sizeof(PyASCIIObject);
1000 }
1001 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001002 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 char_size = 1;
1004 }
1005 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 2;
1008 if (sizeof(wchar_t) == 2)
1009 is_sharing = 1;
1010 }
1011 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001012 if (maxchar > MAX_UNICODE) {
1013 PyErr_SetString(PyExc_SystemError,
1014 "invalid maximum character passed to PyUnicode_New");
1015 return NULL;
1016 }
Victor Stinner8f825062012-04-27 13:55:39 +02001017 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 char_size = 4;
1019 if (sizeof(wchar_t) == 4)
1020 is_sharing = 1;
1021 }
1022
1023 /* Ensure we won't overflow the size. */
1024 if (size < 0) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "Negative size passed to PyUnicode_New");
1027 return NULL;
1028 }
1029 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1030 return PyErr_NoMemory();
1031
1032 /* Duplicated allocation code from _PyObject_New() instead of a call to
1033 * PyObject_New() so we are able to allocate space for the object and
1034 * it's data buffer.
1035 */
1036 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1037 if (obj == NULL)
1038 return PyErr_NoMemory();
1039 obj = PyObject_INIT(obj, &PyUnicode_Type);
1040 if (obj == NULL)
1041 return NULL;
1042
1043 unicode = (PyCompactUnicodeObject *)obj;
1044 if (is_ascii)
1045 data = ((PyASCIIObject*)obj) + 1;
1046 else
1047 data = unicode + 1;
1048 _PyUnicode_LENGTH(unicode) = size;
1049 _PyUnicode_HASH(unicode) = -1;
1050 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 _PyUnicode_STATE(unicode).compact = 1;
1053 _PyUnicode_STATE(unicode).ready = 1;
1054 _PyUnicode_STATE(unicode).ascii = is_ascii;
1055 if (is_ascii) {
1056 ((char*)data)[size] = 0;
1057 _PyUnicode_WSTR(unicode) = NULL;
1058 }
Victor Stinner8f825062012-04-27 13:55:39 +02001059 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001064 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 else {
1067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((Py_UCS4*)data)[size] = 0;
1073 if (is_sharing) {
1074 _PyUnicode_WSTR_LENGTH(unicode) = size;
1075 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1076 }
1077 else {
1078 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1079 _PyUnicode_WSTR(unicode) = NULL;
1080 }
1081 }
Victor Stinner8f825062012-04-27 13:55:39 +02001082#ifdef Py_DEBUG
1083 /* Fill the data with invalid characters to detect bugs earlier.
1084 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1085 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1086 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1087 memset(data, 0xff, size * kind);
1088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit is copied as is.  The other conversions are implemented as macros
   for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* The output must never run past the string's declared length. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* Exactly the declared number of code points must have been written. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 assert(PyUnicode_Check(from));
1154 assert(PyUnicode_Check(to));
1155 assert(PyUnicode_IS_READY(from));
1156 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001158 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1160 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001162 if (how_many == 0)
1163 return 0;
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001168 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001170#ifdef Py_DEBUG
1171 if (!check_maxchar
1172 && (from_kind > to_kind
1173 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184 fast = (from_kind == to_kind);
1185 if (check_maxchar
1186 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1187 {
1188 /* deny latin1 => ascii */
1189 fast = 0;
1190 }
1191
1192 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001193 Py_MEMCPY((char*)to_data + to_kind * to_start,
1194 (char*)from_data + from_kind * from_start,
1195 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001197 else if (from_kind == PyUnicode_1BYTE_KIND
1198 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001199 {
1200 _PyUnicode_CONVERT_BYTES(
1201 Py_UCS1, Py_UCS2,
1202 PyUnicode_1BYTE_DATA(from) + from_start,
1203 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1204 PyUnicode_2BYTE_DATA(to) + to_start
1205 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001207 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 && to_kind == PyUnicode_4BYTE_KIND)
1209 {
1210 _PyUnicode_CONVERT_BYTES(
1211 Py_UCS1, Py_UCS4,
1212 PyUnicode_1BYTE_DATA(from) + from_start,
1213 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1214 PyUnicode_4BYTE_DATA(to) + to_start
1215 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001216 }
1217 else if (from_kind == PyUnicode_2BYTE_KIND
1218 && to_kind == PyUnicode_4BYTE_KIND)
1219 {
1220 _PyUnicode_CONVERT_BYTES(
1221 Py_UCS2, Py_UCS4,
1222 PyUnicode_2BYTE_DATA(from) + from_start,
1223 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1224 PyUnicode_4BYTE_DATA(to) + to_start
1225 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001226 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001228 /* check if max_char(from substring) <= max_char(to) */
1229 if (from_kind > to_kind
1230 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001231 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001232 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001233 /* slow path to check for character overflow */
1234 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001235 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001236 Py_ssize_t i;
1237
Victor Stinner56c161a2011-10-06 02:47:11 +02001238#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001239 for (i=0; i < how_many; i++) {
1240 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001241 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1243 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001244#else
1245 if (!check_maxchar) {
1246 for (i=0; i < how_many; i++) {
1247 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1248 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1249 }
1250 }
1251 else {
1252 for (i=0; i < how_many; i++) {
1253 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1254 if (ch > to_maxchar)
1255 return 1;
1256 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1257 }
1258 }
1259#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001260 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001261 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001262 assert(0 && "inconsistent state");
1263 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001264 }
1265 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001266 return 0;
1267}
1268
1269static void
1270copy_characters(PyObject *to, Py_ssize_t to_start,
1271 PyObject *from, Py_ssize_t from_start,
1272 Py_ssize_t how_many)
1273{
1274 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1275}
1276
1277Py_ssize_t
1278PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1279 PyObject *from, Py_ssize_t from_start,
1280 Py_ssize_t how_many)
1281{
1282 int err;
1283
1284 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1285 PyErr_BadInternalCall();
1286 return -1;
1287 }
1288
Benjamin Petersonbac79492012-01-14 13:34:47 -05001289 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001291 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001292 return -1;
1293
1294 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1295 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1296 PyErr_Format(PyExc_SystemError,
1297 "Cannot write %zi characters at %zi "
1298 "in a string of %zi characters",
1299 how_many, to_start, PyUnicode_GET_LENGTH(to));
1300 return -1;
1301 }
1302
1303 if (how_many == 0)
1304 return 0;
1305
Victor Stinner488fa492011-12-12 00:01:39 +01001306 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
1308
1309 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1310 if (err) {
1311 PyErr_Format(PyExc_SystemError,
1312 "Cannot copy %s characters "
1313 "into a string of %s characters",
1314 unicode_kind_name(from),
1315 unicode_kind_name(to));
1316 return -1;
1317 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001318 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319}
1320
Victor Stinner17222162011-09-28 22:15:37 +02001321/* Find the maximum code point and count the number of surrogate pairs so a
1322 correct string length can be computed before converting a string to UCS4.
1323 This function counts single surrogates as a character and not as a pair.
1324
1325 Return 0 on success, or -1 on error. */
1326static int
1327find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1328 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329{
1330 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001331 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332
Victor Stinnerc53be962011-10-02 21:33:54 +02001333 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 *num_surrogates = 0;
1335 *maxchar = 0;
1336
1337 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001339 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1340 && (iter+1) < end
1341 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001343 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 iter += 2;
1346 }
1347 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001349 {
1350 ch = *iter;
1351 iter++;
1352 }
1353 if (ch > *maxchar) {
1354 *maxchar = ch;
1355 if (*maxchar > MAX_UNICODE) {
1356 PyErr_Format(PyExc_ValueError,
1357 "character U+%x is not in range [U+0000; U+10ffff]",
1358 ch);
1359 return -1;
1360 }
1361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 }
1363 return 0;
1364}
1365
#ifdef Py_DEBUG
/* Debug-build statistic: incremented on each call to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
#endif
1369
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001370int
1371_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372{
1373 wchar_t *end;
1374 Py_UCS4 maxchar = 0;
1375 Py_ssize_t num_surrogates;
1376#if SIZEOF_WCHAR_T == 2
1377 Py_ssize_t length_wo_surrogates;
1378#endif
1379
Georg Brandl7597add2011-10-05 16:36:47 +02001380 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001381 strings were created using _PyObject_New() and where no canonical
1382 representation (the str field) has been set yet aka strings
1383 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001384 assert(_PyUnicode_CHECK(unicode));
1385 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001387 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001389 /* Actually, it should neither be interned nor be anything else: */
1390 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
1392#ifdef Py_DEBUG
1393 ++unicode_ready_calls;
1394#endif
1395
1396 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001397 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001398 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400
1401 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001402 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1403 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyErr_NoMemory();
1405 return -1;
1406 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001407 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 _PyUnicode_WSTR(unicode), end,
1409 PyUnicode_1BYTE_DATA(unicode));
1410 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1411 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1412 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1413 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001414 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 }
1418 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001419 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001420 _PyUnicode_UTF8(unicode) = NULL;
1421 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 PyObject_FREE(_PyUnicode_WSTR(unicode));
1424 _PyUnicode_WSTR(unicode) = NULL;
1425 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1426 }
1427 /* In this case we might have to convert down from 4-byte native
1428 wchar_t to 2-byte unicode. */
1429 else if (maxchar < 65536) {
1430 assert(num_surrogates == 0 &&
1431 "FindMaxCharAndNumSurrogatePairs() messed up");
1432
Victor Stinner506f5922011-09-28 22:34:18 +02001433#if SIZEOF_WCHAR_T == 2
1434 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001435 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001436 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1437 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1438 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001439 _PyUnicode_UTF8(unicode) = NULL;
1440 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001441#else
1442 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001443 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001444 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001446 PyErr_NoMemory();
1447 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 }
Victor Stinner506f5922011-09-28 22:34:18 +02001449 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1450 _PyUnicode_WSTR(unicode), end,
1451 PyUnicode_2BYTE_DATA(unicode));
1452 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1453 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1454 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 _PyUnicode_UTF8(unicode) = NULL;
1456 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1463 else {
1464#if SIZEOF_WCHAR_T == 2
1465 /* in case the native representation is 2-bytes, we need to allocate a
1466 new normalized 4-byte version. */
1467 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1469 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 PyErr_NoMemory();
1471 return -1;
1472 }
1473 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1474 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001475 _PyUnicode_UTF8(unicode) = NULL;
1476 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001477 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1478 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001479 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 PyObject_FREE(_PyUnicode_WSTR(unicode));
1481 _PyUnicode_WSTR(unicode) = NULL;
1482 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1483#else
1484 assert(num_surrogates == 0);
1485
Victor Stinnerc3c74152011-10-02 20:39:55 +02001486 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1491#endif
1492 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1493 }
1494 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001495 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 return 0;
1497}
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001500unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501{
Walter Dörwald16807132007-05-25 13:52:07 +00001502 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 case SSTATE_NOT_INTERNED:
1504 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 case SSTATE_INTERNED_MORTAL:
1507 /* revive dead object temporarily for DelItem */
1508 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001509 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 Py_FatalError(
1511 "deletion of interned string failed");
1512 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001513
Benjamin Peterson29060642009-01-31 22:14:21 +00001514 case SSTATE_INTERNED_IMMORTAL:
1515 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001516
Benjamin Peterson29060642009-01-31 22:14:21 +00001517 default:
1518 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001519 }
1520
Victor Stinner03490912011-10-03 23:45:12 +02001521 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001523 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001524 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001525 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1526 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001528 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529}
1530
#ifdef Py_DEBUG
/* Debug-only helper: return 1 when `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a string that is not of the wchar kind and has length 1 can be
       a cached Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }
    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1547
Alexander Belopolsky40018472011-02-26 01:02:56 +00001548static int
Victor Stinner488fa492011-12-12 00:01:39 +01001549unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001550{
Victor Stinner488fa492011-12-12 00:01:39 +01001551 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001552 if (Py_REFCNT(unicode) != 1)
1553 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001554 if (_PyUnicode_HASH(unicode) != -1)
1555 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001556 if (PyUnicode_CHECK_INTERNED(unicode))
1557 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001558 if (!PyUnicode_CheckExact(unicode))
1559 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001560#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001561 /* singleton refcount is greater than 1 */
1562 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001563#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001564 return 1;
1565}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001566
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567static int
1568unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1569{
1570 PyObject *unicode;
1571 Py_ssize_t old_length;
1572
1573 assert(p_unicode != NULL);
1574 unicode = *p_unicode;
1575
1576 assert(unicode != NULL);
1577 assert(PyUnicode_Check(unicode));
1578 assert(0 <= length);
1579
Victor Stinner910337b2011-10-03 03:20:16 +02001580 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 old_length = PyUnicode_WSTR_LENGTH(unicode);
1582 else
1583 old_length = PyUnicode_GET_LENGTH(unicode);
1584 if (old_length == length)
1585 return 0;
1586
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001587 if (length == 0) {
1588 Py_DECREF(*p_unicode);
1589 *p_unicode = unicode_empty;
1590 Py_INCREF(*p_unicode);
1591 return 0;
1592 }
1593
Victor Stinner488fa492011-12-12 00:01:39 +01001594 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 PyObject *copy = resize_copy(unicode, length);
1596 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001597 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 Py_DECREF(*p_unicode);
1599 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001601 }
1602
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 PyObject *new_unicode = resize_compact(unicode, length);
1605 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001607 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001608 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001609 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001610 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001611}
1612
Alexander Belopolsky40018472011-02-26 01:02:56 +00001613int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001614PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001615{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 PyObject *unicode;
1617 if (p_unicode == NULL) {
1618 PyErr_BadInternalCall();
1619 return -1;
1620 }
1621 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001622 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 {
1624 PyErr_BadInternalCall();
1625 return -1;
1626 }
1627 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001628}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001630static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001631unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1632 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001633{
1634 PyObject *result;
1635 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001636 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001637 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1638 return 0;
1639 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1640 maxchar);
1641 if (result == NULL)
1642 return -1;
Victor Stinner1b487b42012-05-03 12:29:04 +02001643 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001644 Py_DECREF(*p_unicode);
1645 *p_unicode = result;
1646 return 0;
1647}
1648
1649static int
1650unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1651 Py_UCS4 ch)
1652{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001653 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001654 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 return -1;
1656 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1657 PyUnicode_DATA(*p_unicode),
1658 (*pos)++, ch);
1659 return 0;
1660}
1661
Victor Stinnerc5166102012-02-22 13:55:02 +01001662/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1663 Return the length of the input string.
1664
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001665 WARNING: The function doesn't copy the terminating null character and
1666 doesn't check the maximum character (may write a latin1 character in an
1667 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001668static Py_ssize_t
1669unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1670{
1671 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1672 void *data = PyUnicode_DATA(unicode);
1673
1674 switch (kind) {
1675 case PyUnicode_1BYTE_KIND: {
1676 Py_ssize_t len = strlen(str);
1677 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001678 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001679 return len;
1680 }
1681 case PyUnicode_2BYTE_KIND: {
1682 Py_UCS2 *start = (Py_UCS2 *)data + index;
1683 Py_UCS2 *ucs2 = start;
1684 assert(index <= PyUnicode_GET_LENGTH(unicode));
1685
1686 for (; *str; ++ucs2, ++str)
1687 *ucs2 = (Py_UCS2)*str;
1688
1689 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1690 return ucs2 - start;
1691 }
1692 default: {
1693 Py_UCS4 *start = (Py_UCS4 *)data + index;
1694 Py_UCS4 *ucs4 = start;
1695 assert(kind == PyUnicode_4BYTE_KIND);
1696 assert(index <= PyUnicode_GET_LENGTH(unicode));
1697
1698 for (; *str; ++ucs4, ++str)
1699 *ucs4 = (Py_UCS4)*str;
1700
1701 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1702 return ucs4 - start;
1703 }
1704 }
1705}
1706
1707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708static PyObject*
1709get_latin1_char(unsigned char ch)
1710{
Victor Stinnera464fc12011-10-02 20:39:30 +02001711 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001713 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 if (!unicode)
1715 return NULL;
1716 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001717 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 unicode_latin1[ch] = unicode;
1719 }
1720 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001721 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722}
1723
Alexander Belopolsky40018472011-02-26 01:02:56 +00001724PyObject *
1725PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001727 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 Py_UCS4 maxchar = 0;
1729 Py_ssize_t num_surrogates;
1730
1731 if (u == NULL)
1732 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734 /* If the Unicode data is known at construction time, we can apply
1735 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 /* Optimization for empty strings */
1738 if (size == 0 && unicode_empty != NULL) {
1739 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001740 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001741 }
Tim Petersced69f82003-09-16 20:30:58 +00001742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 /* Single character Unicode objects in the Latin-1 range are
1744 shared when using this constructor */
1745 if (size == 1 && *u < 256)
1746 return get_latin1_char((unsigned char)*u);
1747
1748 /* If not empty and not single character, copy the Unicode data
1749 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001750 if (find_maxchar_surrogates(u, u + size,
1751 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 return NULL;
1753
Victor Stinner8faf8212011-12-08 22:14:11 +01001754 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 if (!unicode)
1756 return NULL;
1757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 switch (PyUnicode_KIND(unicode)) {
1759 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001760 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1762 break;
1763 case PyUnicode_2BYTE_KIND:
1764#if Py_UNICODE_SIZE == 2
1765 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1766#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001767 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1769#endif
1770 break;
1771 case PyUnicode_4BYTE_KIND:
1772#if SIZEOF_WCHAR_T == 2
1773 /* This is the only case which has to process surrogates, thus
1774 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001775 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776#else
1777 assert(num_surrogates == 0);
1778 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1779#endif
1780 break;
1781 default:
1782 assert(0 && "Impossible state");
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001785 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786}
1787
Alexander Belopolsky40018472011-02-26 01:02:56 +00001788PyObject *
1789PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001790{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 if (size < 0) {
1792 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001793 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001794 return NULL;
1795 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001796 if (u != NULL)
1797 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1798 else
1799 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001800}
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802PyObject *
1803PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001804{
1805 size_t size = strlen(u);
1806 if (size > PY_SSIZE_T_MAX) {
1807 PyErr_SetString(PyExc_OverflowError, "input too long");
1808 return NULL;
1809 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001810 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001811}
1812
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001813PyObject *
1814_PyUnicode_FromId(_Py_Identifier *id)
1815{
1816 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001817 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1818 strlen(id->string),
1819 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001820 if (!id->object)
1821 return NULL;
1822 PyUnicode_InternInPlace(&id->object);
1823 assert(!id->next);
1824 id->next = static_strings;
1825 static_strings = id;
1826 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001827 return id->object;
1828}
1829
1830void
1831_PyUnicode_ClearStaticStrings()
1832{
1833 _Py_Identifier *i;
1834 for (i = static_strings; i; i = i->next) {
1835 Py_DECREF(i->object);
1836 i->object = NULL;
1837 i->next = NULL;
1838 }
1839}
1840
Benjamin Peterson0df54292012-03-26 14:50:32 -04001841/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001842
Victor Stinnere57b1c02011-09-28 22:20:48 +02001843static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001844unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001845{
Victor Stinner785938e2011-12-11 20:09:03 +01001846 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001847 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001848#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001849 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001850#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001851 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001852 }
Victor Stinner785938e2011-12-11 20:09:03 +01001853 unicode = PyUnicode_New(size, 127);
1854 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001855 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001856 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1857 assert(_PyUnicode_CheckConsistency(unicode, 1));
1858 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001859}
1860
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001861static Py_UCS4
1862kind_maxchar_limit(unsigned int kind)
1863{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001864 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001865 case PyUnicode_1BYTE_KIND:
1866 return 0x80;
1867 case PyUnicode_2BYTE_KIND:
1868 return 0x100;
1869 case PyUnicode_4BYTE_KIND:
1870 return 0x10000;
1871 default:
1872 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001873 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001874 }
1875}
1876
Victor Stinnere6abb482012-05-02 01:15:40 +02001877Py_LOCAL_INLINE(Py_UCS4)
1878align_maxchar(Py_UCS4 maxchar)
1879{
1880 if (maxchar <= 127)
1881 return 127;
1882 else if (maxchar <= 255)
1883 return 255;
1884 else if (maxchar <= 65535)
1885 return 65535;
1886 else
1887 return MAX_UNICODE;
1888}
1889
Victor Stinner702c7342011-10-05 13:50:52 +02001890static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001891_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001894 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001895
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001896 if (size == 0) {
1897 Py_INCREF(unicode_empty);
1898 return unicode_empty;
1899 }
1900 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001901 if (size == 1)
1902 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001903
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001904 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001905 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 if (!res)
1907 return NULL;
1908 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001909 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001911}
1912
Victor Stinnere57b1c02011-09-28 22:20:48 +02001913static PyObject*
1914_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915{
1916 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001918
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001919 if (size == 0) {
1920 Py_INCREF(unicode_empty);
1921 return unicode_empty;
1922 }
1923 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001924 if (size == 1) {
1925 Py_UCS4 ch = u[0];
1926 if (ch < 256)
1927 return get_latin1_char((unsigned char)ch);
1928
1929 res = PyUnicode_New(1, ch);
1930 if (res == NULL)
1931 return NULL;
1932 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1933 assert(_PyUnicode_CheckConsistency(res, 1));
1934 return res;
1935 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (!res)
1940 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001941 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001943 else {
1944 _PyUnicode_CONVERT_BYTES(
1945 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1946 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001947 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 return res;
1949}
1950
Victor Stinnere57b1c02011-09-28 22:20:48 +02001951static PyObject*
1952_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953{
1954 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001955 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001956
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001957 if (size == 0) {
1958 Py_INCREF(unicode_empty);
1959 return unicode_empty;
1960 }
1961 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001962 if (size == 1) {
1963 Py_UCS4 ch = u[0];
1964 if (ch < 256)
1965 return get_latin1_char((unsigned char)ch);
1966
1967 res = PyUnicode_New(1, ch);
1968 if (res == NULL)
1969 return NULL;
1970 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1971 assert(_PyUnicode_CheckConsistency(res, 1));
1972 return res;
1973 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001979 if (max_char < 256)
1980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1981 PyUnicode_1BYTE_DATA(res));
1982 else if (max_char < 0x10000)
1983 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1984 PyUnicode_2BYTE_DATA(res));
1985 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001987 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 return res;
1989}
1990
1991PyObject*
1992PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1993{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001994 if (size < 0) {
1995 PyErr_SetString(PyExc_ValueError, "size must be positive");
1996 return NULL;
1997 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001998 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002000 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002002 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002004 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002005 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002006 PyErr_SetString(PyExc_SystemError, "invalid kind");
2007 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009}
2010
Victor Stinnerece58de2012-04-23 23:36:38 +02002011Py_UCS4
2012_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2013{
2014 enum PyUnicode_Kind kind;
2015 void *startptr, *endptr;
2016
2017 assert(PyUnicode_IS_READY(unicode));
2018 assert(0 <= start);
2019 assert(end <= PyUnicode_GET_LENGTH(unicode));
2020 assert(start <= end);
2021
2022 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2023 return PyUnicode_MAX_CHAR_VALUE(unicode);
2024
2025 if (start == end)
2026 return 127;
2027
Victor Stinner94d558b2012-04-27 22:26:58 +02002028 if (PyUnicode_IS_ASCII(unicode))
2029 return 127;
2030
Victor Stinnerece58de2012-04-23 23:36:38 +02002031 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002032 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002033 endptr = (char *)startptr + end * kind;
2034 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002035 switch(kind) {
2036 case PyUnicode_1BYTE_KIND:
2037 return ucs1lib_find_max_char(startptr, endptr);
2038 case PyUnicode_2BYTE_KIND:
2039 return ucs2lib_find_max_char(startptr, endptr);
2040 case PyUnicode_4BYTE_KIND:
2041 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002043 assert(0);
2044 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002045 }
2046}
2047
Victor Stinner25a4b292011-10-06 12:31:55 +02002048/* Ensure that a string uses the most efficient storage, if it is not the
2049 case: create a new string with of the right kind. Write NULL into *p_unicode
2050 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002051static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002052unicode_adjust_maxchar(PyObject **p_unicode)
2053{
2054 PyObject *unicode, *copy;
2055 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002056 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002057 unsigned int kind;
2058
2059 assert(p_unicode != NULL);
2060 unicode = *p_unicode;
2061 assert(PyUnicode_IS_READY(unicode));
2062 if (PyUnicode_IS_ASCII(unicode))
2063 return;
2064
2065 len = PyUnicode_GET_LENGTH(unicode);
2066 kind = PyUnicode_KIND(unicode);
2067 if (kind == PyUnicode_1BYTE_KIND) {
2068 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002069 max_char = ucs1lib_find_max_char(u, u + len);
2070 if (max_char >= 128)
2071 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002072 }
2073 else if (kind == PyUnicode_2BYTE_KIND) {
2074 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002075 max_char = ucs2lib_find_max_char(u, u + len);
2076 if (max_char >= 256)
2077 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 }
2079 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002081 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002082 max_char = ucs4lib_find_max_char(u, u + len);
2083 if (max_char >= 0x10000)
2084 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 copy = PyUnicode_New(len, max_char);
2087 copy_characters(copy, 0, unicode, 0, len);
2088 Py_DECREF(unicode);
2089 *p_unicode = copy;
2090}
2091
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002093_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002094{
Victor Stinner87af4f22011-11-21 23:03:47 +01002095 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002096 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002097
Victor Stinner034f6cf2011-09-30 02:26:44 +02002098 if (!PyUnicode_Check(unicode)) {
2099 PyErr_BadInternalCall();
2100 return NULL;
2101 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002102 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002103 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002104
Victor Stinner87af4f22011-11-21 23:03:47 +01002105 length = PyUnicode_GET_LENGTH(unicode);
2106 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002107 if (!copy)
2108 return NULL;
2109 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2110
Victor Stinner87af4f22011-11-21 23:03:47 +01002111 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2112 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002113 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002114 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002115}
2116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117
Victor Stinnerbc603d12011-10-02 01:00:40 +02002118/* Widen Unicode objects to larger buffers. Don't write terminating null
2119 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002120
2121void*
2122_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2123{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002124 Py_ssize_t len;
2125 void *result;
2126 unsigned int skind;
2127
Benjamin Petersonbac79492012-01-14 13:34:47 -05002128 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002129 return NULL;
2130
2131 len = PyUnicode_GET_LENGTH(s);
2132 skind = PyUnicode_KIND(s);
2133 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002134 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 return NULL;
2136 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002137 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 case PyUnicode_2BYTE_KIND:
2139 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2140 if (!result)
2141 return PyErr_NoMemory();
2142 assert(skind == PyUnicode_1BYTE_KIND);
2143 _PyUnicode_CONVERT_BYTES(
2144 Py_UCS1, Py_UCS2,
2145 PyUnicode_1BYTE_DATA(s),
2146 PyUnicode_1BYTE_DATA(s) + len,
2147 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002149 case PyUnicode_4BYTE_KIND:
2150 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2151 if (!result)
2152 return PyErr_NoMemory();
2153 if (skind == PyUnicode_2BYTE_KIND) {
2154 _PyUnicode_CONVERT_BYTES(
2155 Py_UCS2, Py_UCS4,
2156 PyUnicode_2BYTE_DATA(s),
2157 PyUnicode_2BYTE_DATA(s) + len,
2158 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002160 else {
2161 assert(skind == PyUnicode_1BYTE_KIND);
2162 _PyUnicode_CONVERT_BYTES(
2163 Py_UCS1, Py_UCS4,
2164 PyUnicode_1BYTE_DATA(s),
2165 PyUnicode_1BYTE_DATA(s) + len,
2166 result);
2167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 default:
2170 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 }
Victor Stinner01698042011-10-04 00:04:26 +02002172 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return NULL;
2174}
2175
2176static Py_UCS4*
2177as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2178 int copy_null)
2179{
2180 int kind;
2181 void *data;
2182 Py_ssize_t len, targetlen;
2183 if (PyUnicode_READY(string) == -1)
2184 return NULL;
2185 kind = PyUnicode_KIND(string);
2186 data = PyUnicode_DATA(string);
2187 len = PyUnicode_GET_LENGTH(string);
2188 targetlen = len;
2189 if (copy_null)
2190 targetlen++;
2191 if (!target) {
2192 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2193 PyErr_NoMemory();
2194 return NULL;
2195 }
2196 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2197 if (!target) {
2198 PyErr_NoMemory();
2199 return NULL;
2200 }
2201 }
2202 else {
2203 if (targetsize < targetlen) {
2204 PyErr_Format(PyExc_SystemError,
2205 "string is longer than the buffer");
2206 if (copy_null && 0 < targetsize)
2207 target[0] = 0;
2208 return NULL;
2209 }
2210 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002211 if (kind == PyUnicode_1BYTE_KIND) {
2212 Py_UCS1 *start = (Py_UCS1 *) data;
2213 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002215 else if (kind == PyUnicode_2BYTE_KIND) {
2216 Py_UCS2 *start = (Py_UCS2 *) data;
2217 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2218 }
2219 else {
2220 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (copy_null)
2224 target[len] = 0;
2225 return target;
2226}
2227
2228Py_UCS4*
2229PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230 int copy_null)
2231{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002232 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 PyErr_BadInternalCall();
2234 return NULL;
2235 }
2236 return as_ucs4(string, target, targetsize, copy_null);
2237}
2238
2239Py_UCS4*
2240PyUnicode_AsUCS4Copy(PyObject *string)
2241{
2242 return as_ucs4(string, NULL, 0, 1);
2243}
2244
2245#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002246
Alexander Belopolsky40018472011-02-26 01:02:56 +00002247PyObject *
2248PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002251 if (size == 0) {
2252 Py_INCREF(unicode_empty);
2253 return unicode_empty;
2254 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 PyErr_BadInternalCall();
2256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 }
2258
Martin v. Löwis790465f2008-04-05 20:41:37 +00002259 if (size == -1) {
2260 size = wcslen(w);
2261 }
2262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264}
2265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002267
Walter Dörwald346737f2007-05-31 10:44:43 +00002268static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002269makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2270 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002271{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002272 *fmt++ = '%';
2273 if (width) {
2274 if (zeropad)
2275 *fmt++ = '0';
2276 fmt += sprintf(fmt, "%d", width);
2277 }
2278 if (precision)
2279 fmt += sprintf(fmt, ".%d", precision);
2280 if (longflag)
2281 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002282 else if (longlongflag) {
2283 /* longlongflag should only ever be nonzero on machines with
2284 HAVE_LONG_LONG defined */
2285#ifdef HAVE_LONG_LONG
2286 char *f = PY_FORMAT_LONG_LONG;
2287 while (*f)
2288 *fmt++ = *f++;
2289#else
2290 /* we shouldn't ever get here */
2291 assert(0);
2292 *fmt++ = 'l';
2293#endif
2294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002295 else if (size_tflag) {
2296 char *f = PY_FORMAT_SIZE_T;
2297 while (*f)
2298 *fmt++ = *f++;
2299 }
2300 *fmt++ = c;
2301 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002302}
2303
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%' pointed to by `f`.

   Parses an optional decimal width, an optional ".precision" and an
   optional length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or
   "z"); a modifier is only recognized when it is directly followed by
   'd', 'u' or 'i'.  Each result is stored through the matching output
   pointer unless that pointer is NULL.

   Return a pointer to the character the caller should treat as the
   conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;                                /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag: %zd, %zu and %zi. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        is_size_t = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2368
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002369/* maximum number of characters required for output of %ld. 21 characters
2370 allows for 64-bit integers (in decimal) and an optional sign. */
2371#define MAX_LONG_CHARS 21
2372/* maximum number of characters required for output of %lld.
2373 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2374 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2375#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2376
Walter Dörwaldd2034312007-05-18 16:29:38 +00002377PyObject *
2378PyUnicode_FromFormatV(const char *format, va_list vargs)
2379{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002380 va_list count;
2381 Py_ssize_t callcount = 0;
2382 PyObject **callresults = NULL;
2383 PyObject **callresult = NULL;
2384 Py_ssize_t n = 0;
2385 int width = 0;
2386 int precision = 0;
2387 int zeropad;
2388 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002389 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002391 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2393 Py_UCS4 argmaxchar;
2394 Py_ssize_t numbersize = 0;
2395 char *numberresults = NULL;
2396 char *numberresult = NULL;
2397 Py_ssize_t i;
2398 int kind;
2399 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002400
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002401 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002402 /* step 1: count the number of %S/%R/%A/%s format specifications
2403 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2404 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002406 * also estimate a upper bound for all the number formats in the string,
2407 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 for (f = format; *f; f++) {
2410 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002411 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2413 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2414 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2415 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002418#ifdef HAVE_LONG_LONG
2419 if (longlongflag) {
2420 if (width < MAX_LONG_LONG_CHARS)
2421 width = MAX_LONG_LONG_CHARS;
2422 }
2423 else
2424#endif
2425 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2426 including sign. Decimal takes the most space. This
2427 isn't enough for octal. If a width is specified we
2428 need more (which we allocate later). */
2429 if (width < MAX_LONG_CHARS)
2430 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002431
2432 /* account for the size + '\0' to separate numbers
2433 inside of the numberresults buffer */
2434 numbersize += (width + 1);
2435 }
2436 }
2437 else if ((unsigned char)*f > 127) {
2438 PyErr_Format(PyExc_ValueError,
2439 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2440 "string, got a non-ASCII byte: 0x%02x",
2441 (unsigned char)*f);
2442 return NULL;
2443 }
2444 }
2445 /* step 2: allocate memory for the results of
2446 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2447 if (callcount) {
2448 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2449 if (!callresults) {
2450 PyErr_NoMemory();
2451 return NULL;
2452 }
2453 callresult = callresults;
2454 }
2455 /* step 2.5: allocate memory for the results of formating numbers */
2456 if (numbersize) {
2457 numberresults = PyObject_Malloc(numbersize);
2458 if (!numberresults) {
2459 PyErr_NoMemory();
2460 goto fail;
2461 }
2462 numberresult = numberresults;
2463 }
2464
2465 /* step 3: format numbers and figure out how large a buffer we need */
2466 for (f = format; *f; f++) {
2467 if (*f == '%') {
2468 const char* p;
2469 int longflag;
2470 int longlongflag;
2471 int size_tflag;
2472 int numprinted;
2473
2474 p = f;
2475 zeropad = (f[1] == '0');
2476 f = parse_format_flags(f, &width, &precision,
2477 &longflag, &longlongflag, &size_tflag);
2478 switch (*f) {
2479 case 'c':
2480 {
2481 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002482 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 n++;
2484 break;
2485 }
2486 case '%':
2487 n++;
2488 break;
2489 case 'i':
2490 case 'd':
2491 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2492 width, precision, *f);
2493 if (longflag)
2494 numprinted = sprintf(numberresult, fmt,
2495 va_arg(count, long));
2496#ifdef HAVE_LONG_LONG
2497 else if (longlongflag)
2498 numprinted = sprintf(numberresult, fmt,
2499 va_arg(count, PY_LONG_LONG));
2500#endif
2501 else if (size_tflag)
2502 numprinted = sprintf(numberresult, fmt,
2503 va_arg(count, Py_ssize_t));
2504 else
2505 numprinted = sprintf(numberresult, fmt,
2506 va_arg(count, int));
2507 n += numprinted;
2508 /* advance by +1 to skip over the '\0' */
2509 numberresult += (numprinted + 1);
2510 assert(*(numberresult - 1) == '\0');
2511 assert(*(numberresult - 2) != '\0');
2512 assert(numprinted >= 0);
2513 assert(numberresult <= numberresults + numbersize);
2514 break;
2515 case 'u':
2516 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2517 width, precision, 'u');
2518 if (longflag)
2519 numprinted = sprintf(numberresult, fmt,
2520 va_arg(count, unsigned long));
2521#ifdef HAVE_LONG_LONG
2522 else if (longlongflag)
2523 numprinted = sprintf(numberresult, fmt,
2524 va_arg(count, unsigned PY_LONG_LONG));
2525#endif
2526 else if (size_tflag)
2527 numprinted = sprintf(numberresult, fmt,
2528 va_arg(count, size_t));
2529 else
2530 numprinted = sprintf(numberresult, fmt,
2531 va_arg(count, unsigned int));
2532 n += numprinted;
2533 numberresult += (numprinted + 1);
2534 assert(*(numberresult - 1) == '\0');
2535 assert(*(numberresult - 2) != '\0');
2536 assert(numprinted >= 0);
2537 assert(numberresult <= numberresults + numbersize);
2538 break;
2539 case 'x':
2540 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2541 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2542 n += numprinted;
2543 numberresult += (numprinted + 1);
2544 assert(*(numberresult - 1) == '\0');
2545 assert(*(numberresult - 2) != '\0');
2546 assert(numprinted >= 0);
2547 assert(numberresult <= numberresults + numbersize);
2548 break;
2549 case 'p':
2550 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2551 /* %p is ill-defined: ensure leading 0x. */
2552 if (numberresult[1] == 'X')
2553 numberresult[1] = 'x';
2554 else if (numberresult[1] != 'x') {
2555 memmove(numberresult + 2, numberresult,
2556 strlen(numberresult) + 1);
2557 numberresult[0] = '0';
2558 numberresult[1] = 'x';
2559 numprinted += 2;
2560 }
2561 n += numprinted;
2562 numberresult += (numprinted + 1);
2563 assert(*(numberresult - 1) == '\0');
2564 assert(*(numberresult - 2) != '\0');
2565 assert(numprinted >= 0);
2566 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 break;
2568 case 's':
2569 {
2570 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002571 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002572 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002573 if (!str)
2574 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 /* since PyUnicode_DecodeUTF8 returns already flexible
2576 unicode objects, there is no need to call ready on them */
2577 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002578 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002580 /* Remember the str and switch to the next slot */
2581 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 }
2584 case 'U':
2585 {
2586 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002587 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 if (PyUnicode_READY(obj) == -1)
2589 goto fail;
2590 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002591 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 break;
2594 }
2595 case 'V':
2596 {
2597 PyObject *obj = va_arg(count, PyObject *);
2598 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002599 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002600 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002601 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002602 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (PyUnicode_READY(obj) == -1)
2604 goto fail;
2605 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002606 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002608 *callresult++ = NULL;
2609 }
2610 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002611 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002612 if (!str_obj)
2613 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002614 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002615 Py_DECREF(str_obj);
2616 goto fail;
2617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002619 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 *callresult++ = str_obj;
2622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 break;
2624 }
2625 case 'S':
2626 {
2627 PyObject *obj = va_arg(count, PyObject *);
2628 PyObject *str;
2629 assert(obj);
2630 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002633 if (PyUnicode_READY(str) == -1) {
2634 Py_DECREF(str);
2635 goto fail;
2636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002638 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 /* Remember the str and switch to the next slot */
2641 *callresult++ = str;
2642 break;
2643 }
2644 case 'R':
2645 {
2646 PyObject *obj = va_arg(count, PyObject *);
2647 PyObject *repr;
2648 assert(obj);
2649 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002650 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002652 if (PyUnicode_READY(repr) == -1) {
2653 Py_DECREF(repr);
2654 goto fail;
2655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002657 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* Remember the repr and switch to the next slot */
2660 *callresult++ = repr;
2661 break;
2662 }
2663 case 'A':
2664 {
2665 PyObject *obj = va_arg(count, PyObject *);
2666 PyObject *ascii;
2667 assert(obj);
2668 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002669 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002671 if (PyUnicode_READY(ascii) == -1) {
2672 Py_DECREF(ascii);
2673 goto fail;
2674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002676 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 /* Remember the repr and switch to the next slot */
2679 *callresult++ = ascii;
2680 break;
2681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 default:
2683 /* if we stumble upon an unknown
2684 formatting code, copy the rest of
2685 the format string to the output
2686 string. (we cannot just skip the
2687 code, since there's no way to know
2688 what's in the argument list) */
2689 n += strlen(p);
2690 goto expand;
2691 }
2692 } else
2693 n++;
2694 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002695 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 we don't have to resize the string.
2699 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002700 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 if (!string)
2702 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 kind = PyUnicode_KIND(string);
2704 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002710 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002711
2712 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2714 /* checking for == because the last argument could be a empty
2715 string, which causes i to point to end, the assert at the end of
2716 the loop */
2717 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002718
Benjamin Peterson14339b62009-01-31 16:36:08 +00002719 switch (*f) {
2720 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002721 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 const int ordinal = va_arg(vargs, int);
2723 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002725 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002726 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002731 {
2732 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 /* unused, since we already have the result */
2734 if (*f == 'p')
2735 (void) va_arg(vargs, void *);
2736 else
2737 (void) va_arg(vargs, int);
2738 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002739 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002741 i += written;
2742 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 assert(*numberresult == '\0');
2744 numberresult++;
2745 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002747 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002748 case 's':
2749 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002750 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002752 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 size = PyUnicode_GET_LENGTH(*callresult);
2754 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002755 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002757 /* We're done with the unicode()/repr() => forget it */
2758 Py_DECREF(*callresult);
2759 /* switch to next unicode()/repr() result */
2760 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 break;
2762 }
2763 case 'U':
2764 {
2765 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 Py_ssize_t size;
2767 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2768 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002769 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002771 break;
2772 }
2773 case 'V':
2774 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002776 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002777 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 size = PyUnicode_GET_LENGTH(obj);
2780 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002781 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 size = PyUnicode_GET_LENGTH(*callresult);
2785 assert(PyUnicode_KIND(*callresult) <=
2786 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002787 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002789 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002791 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002792 break;
2793 }
2794 case 'S':
2795 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002796 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002798 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 /* unused, since we already have the result */
2800 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002802 copy_characters(string, i, *callresult, 0, size);
2803 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002804 /* We're done with the unicode()/repr() => forget it */
2805 Py_DECREF(*callresult);
2806 /* switch to next unicode()/repr() result */
2807 ++callresult;
2808 break;
2809 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 break;
2813 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002814 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002816 goto end;
2817 }
Victor Stinner1205f272010-09-11 00:54:47 +00002818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 else {
2820 assert(i < PyUnicode_GET_LENGTH(string));
2821 PyUnicode_WRITE(kind, data, i++, *f);
2822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002825
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 if (callresults)
2828 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 if (numberresults)
2830 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002831 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002833 if (callresults) {
2834 PyObject **callresult2 = callresults;
2835 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002836 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002837 ++callresult2;
2838 }
2839 PyObject_Free(callresults);
2840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 if (numberresults)
2842 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002843 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002844}
2845
Walter Dörwaldd2034312007-05-18 16:29:38 +00002846PyObject *
2847PyUnicode_FromFormat(const char *format, ...)
2848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002849 PyObject* ret;
2850 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002851
2852#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002853 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002855 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002856#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002857 ret = PyUnicode_FromFormatV(format, vargs);
2858 va_end(vargs);
2859 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002860}
2861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002862#ifdef HAVE_WCHAR_H
2863
Victor Stinner5593d8a2010-10-02 11:11:27 +00002864/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2865 convert a Unicode object to a wide character string.
2866
Victor Stinnerd88d9832011-09-06 02:00:05 +02002867 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 character) required to convert the unicode object. Ignore size argument.
2869
Victor Stinnerd88d9832011-09-06 02:00:05 +02002870 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002871 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002872 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002873static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002874unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002875 wchar_t *w,
2876 Py_ssize_t size)
2877{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002878 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002879 const wchar_t *wstr;
2880
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002881 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 if (wstr == NULL)
2883 return -1;
2884
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002886 if (size > res)
2887 size = res + 1;
2888 else
2889 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002891 return res;
2892 }
2893 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002895}
2896
2897Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002898PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002899 wchar_t *w,
2900 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901{
2902 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 PyErr_BadInternalCall();
2904 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002906 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907}
2908
Victor Stinner137c34c2010-09-29 10:25:54 +00002909wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002910PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002911 Py_ssize_t *size)
2912{
2913 wchar_t* buffer;
2914 Py_ssize_t buflen;
2915
2916 if (unicode == NULL) {
2917 PyErr_BadInternalCall();
2918 return NULL;
2919 }
2920
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002921 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 if (buflen == -1)
2923 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002924 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002925 PyErr_NoMemory();
2926 return NULL;
2927 }
2928
Victor Stinner137c34c2010-09-29 10:25:54 +00002929 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2930 if (buffer == NULL) {
2931 PyErr_NoMemory();
2932 return NULL;
2933 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002934 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002935 if (buflen == -1)
2936 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002937 if (size != NULL)
2938 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002939 return buffer;
2940}
2941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943
Alexander Belopolsky40018472011-02-26 01:02:56 +00002944PyObject *
2945PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002948 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_ValueError,
2950 "chr() arg not in range(0x110000)");
2951 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002952 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 if (ordinal < 256)
2955 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 v = PyUnicode_New(1, ordinal);
2958 if (v == NULL)
2959 return NULL;
2960 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002961 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002962 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002963}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002968 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002970 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002971 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002972 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 Py_INCREF(obj);
2974 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002975 }
2976 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 /* For a Unicode subtype that's not a Unicode object,
2978 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002979 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002980 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002981 PyErr_Format(PyExc_TypeError,
2982 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002983 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002984 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002989 const char *encoding,
2990 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002991{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002992 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002993 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002994
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 PyErr_BadInternalCall();
2997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002999
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003000 /* Decoding bytes objects is the most common case and should be fast */
3001 if (PyBytes_Check(obj)) {
3002 if (PyBytes_GET_SIZE(obj) == 0) {
3003 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003004 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003005 }
3006 else {
3007 v = PyUnicode_Decode(
3008 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3009 encoding, errors);
3010 }
3011 return v;
3012 }
3013
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003014 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyErr_SetString(PyExc_TypeError,
3016 "decoding str is not supported");
3017 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003018 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003019
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003020 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3021 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3022 PyErr_Format(PyExc_TypeError,
3023 "coercing to str: need bytes, bytearray "
3024 "or buffer-like object, %.80s found",
3025 Py_TYPE(obj)->tp_name);
3026 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003027 }
Tim Petersced69f82003-09-16 20:30:58 +00003028
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003029 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003031 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 }
Tim Petersced69f82003-09-16 20:30:58 +00003033 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003034 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003035
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003036 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003037 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038}
3039
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result; lower_len is the size of that
   buffer in bytes.  Return 1 on success, 0 on error (the normalized name
   plus its terminator does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without any room, not even the terminator can be stored; this also
       keeps lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Bound the copy of the default name like any other name. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3076
Alexander Belopolsky40018472011-02-26 01:02:56 +00003077PyObject *
3078PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003079 Py_ssize_t size,
3080 const char *encoding,
3081 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003082{
3083 PyObject *buffer = NULL, *unicode;
3084 Py_buffer info;
3085 char lower[11]; /* Enough for any encoding shortcut */
3086
Fred Drakee4315f52000-05-09 19:53:39 +00003087 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003088 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003089 if ((strcmp(lower, "utf-8") == 0) ||
3090 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003091 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003092 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003093 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003094 (strcmp(lower, "iso-8859-1") == 0))
3095 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003096#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003097 else if (strcmp(lower, "mbcs") == 0)
3098 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003099#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003100 else if (strcmp(lower, "ascii") == 0)
3101 return PyUnicode_DecodeASCII(s, size, errors);
3102 else if (strcmp(lower, "utf-16") == 0)
3103 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3104 else if (strcmp(lower, "utf-32") == 0)
3105 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107
3108 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003109 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003110 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003111 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003112 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 if (buffer == NULL)
3114 goto onError;
3115 unicode = PyCodec_Decode(buffer, encoding, errors);
3116 if (unicode == NULL)
3117 goto onError;
3118 if (!PyUnicode_Check(unicode)) {
3119 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003120 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003121 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 Py_DECREF(unicode);
3123 goto onError;
3124 }
3125 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003126 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003127
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 Py_XDECREF(buffer);
3130 return NULL;
3131}
3132
Alexander Belopolsky40018472011-02-26 01:02:56 +00003133PyObject *
3134PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003135 const char *encoding,
3136 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003137{
3138 PyObject *v;
3139
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 goto onError;
3143 }
3144
3145 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003147
3148 /* Decode via the codec registry */
3149 v = PyCodec_Decode(unicode, encoding, errors);
3150 if (v == NULL)
3151 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003152 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153
Benjamin Peterson29060642009-01-31 22:14:21 +00003154 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003155 return NULL;
3156}
3157
Alexander Belopolsky40018472011-02-26 01:02:56 +00003158PyObject *
3159PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003160 const char *encoding,
3161 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003162{
3163 PyObject *v;
3164
3165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
3167 goto onError;
3168 }
3169
3170 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172
3173 /* Decode via the codec registry */
3174 v = PyCodec_Decode(unicode, encoding, errors);
3175 if (v == NULL)
3176 goto onError;
3177 if (!PyUnicode_Check(v)) {
3178 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003179 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003180 Py_TYPE(v)->tp_name);
3181 Py_DECREF(v);
3182 goto onError;
3183 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003184 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185
Benjamin Peterson29060642009-01-31 22:14:21 +00003186 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003187 return NULL;
3188}
3189
Alexander Belopolsky40018472011-02-26 01:02:56 +00003190PyObject *
3191PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003192 Py_ssize_t size,
3193 const char *encoding,
3194 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195{
3196 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003197
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 unicode = PyUnicode_FromUnicode(s, size);
3199 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3202 Py_DECREF(unicode);
3203 return v;
3204}
3205
Alexander Belopolsky40018472011-02-26 01:02:56 +00003206PyObject *
3207PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003208 const char *encoding,
3209 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003210{
3211 PyObject *v;
3212
3213 if (!PyUnicode_Check(unicode)) {
3214 PyErr_BadArgument();
3215 goto onError;
3216 }
3217
3218 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003220
3221 /* Encode via the codec registry */
3222 v = PyCodec_Encode(unicode, encoding, errors);
3223 if (v == NULL)
3224 goto onError;
3225 return v;
3226
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003228 return NULL;
3229}
3230
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (a surrogate pair
   counts as one character when wchar_t is 16 bits wide).  Return the index
   of the first failing character, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cur = wstr;
    char scratch[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    wchar_t piece[3];
#else
    wchar_t piece[2];
#endif

    /* piece always holds one null-terminated character to convert. */
#if SIZEOF_WCHAR_T == 2
    piece[2] = 0;
#else
    piece[1] = 0;
#endif

    while (*cur != L'\0') {
        size_t step = 1;

        piece[0] = cur[0];
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            piece[1] = cur[1];
            step = 2;
        }
        else {
            piece[1] = 0;
        }
#endif
        if (wcstombs(scratch, piece, sizeof(scratch)) == (size_t)-1)
            return cur - wstr;
        cur += step;
    }

    /* failed to find the unencodable character */
    return 0;
}
3277
Victor Stinner1b579672011-12-17 05:47:23 +01003278static int
3279locale_error_handler(const char *errors, int *surrogateescape)
3280{
3281 if (errors == NULL) {
3282 *surrogateescape = 0;
3283 return 0;
3284 }
3285
3286 if (strcmp(errors, "strict") == 0) {
3287 *surrogateescape = 0;
3288 return 0;
3289 }
3290 if (strcmp(errors, "surrogateescape") == 0) {
3291 *surrogateescape = 1;
3292 return 0;
3293 }
3294 PyErr_Format(PyExc_ValueError,
3295 "only 'strict' and 'surrogateescape' error handlers "
3296 "are supported, not '%s'",
3297 errors);
3298 return -1;
3299}
3300
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003301PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003302PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003303{
3304 Py_ssize_t wlen, wlen2;
3305 wchar_t *wstr;
3306 PyObject *bytes = NULL;
3307 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003308 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003309 PyObject *exc;
3310 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003311 int surrogateescape;
3312
3313 if (locale_error_handler(errors, &surrogateescape) < 0)
3314 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003315
3316 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3317 if (wstr == NULL)
3318 return NULL;
3319
3320 wlen2 = wcslen(wstr);
3321 if (wlen2 != wlen) {
3322 PyMem_Free(wstr);
3323 PyErr_SetString(PyExc_TypeError, "embedded null character");
3324 return NULL;
3325 }
3326
3327 if (surrogateescape) {
3328 /* locale encoding with surrogateescape */
3329 char *str;
3330
3331 str = _Py_wchar2char(wstr, &error_pos);
3332 if (str == NULL) {
3333 if (error_pos == (size_t)-1) {
3334 PyErr_NoMemory();
3335 PyMem_Free(wstr);
3336 return NULL;
3337 }
3338 else {
3339 goto encode_error;
3340 }
3341 }
3342 PyMem_Free(wstr);
3343
3344 bytes = PyBytes_FromString(str);
3345 PyMem_Free(str);
3346 }
3347 else {
3348 size_t len, len2;
3349
3350 len = wcstombs(NULL, wstr, 0);
3351 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003352 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003353 goto encode_error;
3354 }
3355
3356 bytes = PyBytes_FromStringAndSize(NULL, len);
3357 if (bytes == NULL) {
3358 PyMem_Free(wstr);
3359 return NULL;
3360 }
3361
3362 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3363 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003364 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003365 goto encode_error;
3366 }
3367 PyMem_Free(wstr);
3368 }
3369 return bytes;
3370
3371encode_error:
3372 errmsg = strerror(errno);
3373 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003374
3375 if (error_pos == (size_t)-1)
3376 error_pos = wcstombs_errorpos(wstr);
3377
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003378 PyMem_Free(wstr);
3379 Py_XDECREF(bytes);
3380
Victor Stinner2f197072011-12-17 07:08:30 +01003381 if (errmsg != NULL) {
3382 size_t errlen;
3383 wstr = _Py_char2wchar(errmsg, &errlen);
3384 if (wstr != NULL) {
3385 reason = PyUnicode_FromWideChar(wstr, errlen);
3386 PyMem_Free(wstr);
3387 } else
3388 errmsg = NULL;
3389 }
3390 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003391 reason = PyUnicode_FromString(
3392 "wcstombs() encountered an unencodable "
3393 "wide character");
3394 if (reason == NULL)
3395 return NULL;
3396
3397 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3398 "locale", unicode,
3399 (Py_ssize_t)error_pos,
3400 (Py_ssize_t)(error_pos+1),
3401 reason);
3402 Py_DECREF(reason);
3403 if (exc != NULL) {
3404 PyCodec_StrictErrors(exc);
3405 Py_XDECREF(exc);
3406 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003407 return NULL;
3408}
3409
Victor Stinnerad158722010-10-27 00:25:46 +00003410PyObject *
3411PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003412{
Victor Stinner99b95382011-07-04 14:23:54 +02003413#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003414 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003415#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003416 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003417#else
Victor Stinner793b5312011-04-27 00:24:21 +02003418 PyInterpreterState *interp = PyThreadState_GET()->interp;
3419 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3420 cannot use it to encode and decode filenames before it is loaded. Load
3421 the Python codec requires to encode at least its own filename. Use the C
3422 version of the locale codec until the codec registry is initialized and
3423 the Python codec is loaded.
3424
3425 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3426 cannot only rely on it: check also interp->fscodec_initialized for
3427 subinterpreters. */
3428 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003429 return PyUnicode_AsEncodedString(unicode,
3430 Py_FileSystemDefaultEncoding,
3431 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003432 }
3433 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003434 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003435 }
Victor Stinnerad158722010-10-27 00:25:46 +00003436#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443{
3444 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003445 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003446
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 if (!PyUnicode_Check(unicode)) {
3448 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 }
Fred Drakee4315f52000-05-09 19:53:39 +00003451
Fred Drakee4315f52000-05-09 19:53:39 +00003452 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003453 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003454 if ((strcmp(lower, "utf-8") == 0) ||
3455 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003456 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003457 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003459 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003460 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003461 }
Victor Stinner37296e82010-06-10 13:36:23 +00003462 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003463 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003464 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003466#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003467 else if (strcmp(lower, "mbcs") == 0)
3468 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003469#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003470 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473
3474 /* Encode via the codec registry */
3475 v = PyCodec_Encode(unicode, encoding, errors);
3476 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003477 return NULL;
3478
3479 /* The normal path */
3480 if (PyBytes_Check(v))
3481 return v;
3482
3483 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003484 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003485 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003486 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003487
3488 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3489 "encoder %s returned bytearray instead of bytes",
3490 encoding);
3491 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003492 Py_DECREF(v);
3493 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003494 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003495
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3497 Py_DECREF(v);
3498 return b;
3499 }
3500
3501 PyErr_Format(PyExc_TypeError,
3502 "encoder did not return a bytes object (type=%.400s)",
3503 Py_TYPE(v)->tp_name);
3504 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003505 return NULL;
3506}
3507
Alexander Belopolsky40018472011-02-26 01:02:56 +00003508PyObject *
3509PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003510 const char *encoding,
3511 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003512{
3513 PyObject *v;
3514
3515 if (!PyUnicode_Check(unicode)) {
3516 PyErr_BadArgument();
3517 goto onError;
3518 }
3519
3520 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003522
3523 /* Encode via the codec registry */
3524 v = PyCodec_Encode(unicode, encoding, errors);
3525 if (v == NULL)
3526 goto onError;
3527 if (!PyUnicode_Check(v)) {
3528 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003529 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003530 Py_TYPE(v)->tp_name);
3531 Py_DECREF(v);
3532 goto onError;
3533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 return NULL;
3538}
3539
/* Locate the first byte sequence in str (len bytes) that cannot be decoded
   with the current locale.

   When mbrtowc() is available the string is decoded one character at a
   time; the offset of the first invalid or incomplete sequence is returned.
   Return 0 if no such sequence is found or if mbrtowc() is unavailable. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    wchar_t wc;
    size_t n;

    memset(&state, 0, sizeof state);
    for (; len != 0; cursor += n, len -= n) {
        n = mbrtowc(&wc, (char*)cursor, len, &state);
        if (n == 0) {
            /* Reached end of string */
            break;
        }
        if (n == (size_t)-1 || n == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3570
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003571PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003573 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003574{
3575 wchar_t smallbuf[256];
3576 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3577 wchar_t *wstr;
3578 size_t wlen, wlen2;
3579 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003580 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003581 size_t error_pos;
3582 char *errmsg;
3583 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003584
3585 if (locale_error_handler(errors, &surrogateescape) < 0)
3586 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003587
3588 if (str[len] != '\0' || len != strlen(str)) {
3589 PyErr_SetString(PyExc_TypeError, "embedded null character");
3590 return NULL;
3591 }
3592
3593 if (surrogateescape)
3594 {
3595 wstr = _Py_char2wchar(str, &wlen);
3596 if (wstr == NULL) {
3597 if (wlen == (size_t)-1)
3598 PyErr_NoMemory();
3599 else
3600 PyErr_SetFromErrno(PyExc_OSError);
3601 return NULL;
3602 }
3603
3604 unicode = PyUnicode_FromWideChar(wstr, wlen);
3605 PyMem_Free(wstr);
3606 }
3607 else {
3608#ifndef HAVE_BROKEN_MBSTOWCS
3609 wlen = mbstowcs(NULL, str, 0);
3610#else
3611 wlen = len;
3612#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003613 if (wlen == (size_t)-1)
3614 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003615 if (wlen+1 <= smallbuf_len) {
3616 wstr = smallbuf;
3617 }
3618 else {
3619 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3620 return PyErr_NoMemory();
3621
3622 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3623 if (!wstr)
3624 return PyErr_NoMemory();
3625 }
3626
3627 /* This shouldn't fail now */
3628 wlen2 = mbstowcs(wstr, str, wlen+1);
3629 if (wlen2 == (size_t)-1) {
3630 if (wstr != smallbuf)
3631 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003632 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003633 }
3634#ifdef HAVE_BROKEN_MBSTOWCS
3635 assert(wlen2 == wlen);
3636#endif
3637 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3638 if (wstr != smallbuf)
3639 PyMem_Free(wstr);
3640 }
3641 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003642
3643decode_error:
3644 errmsg = strerror(errno);
3645 assert(errmsg != NULL);
3646
3647 error_pos = mbstowcs_errorpos(str, len);
3648 if (errmsg != NULL) {
3649 size_t errlen;
3650 wstr = _Py_char2wchar(errmsg, &errlen);
3651 if (wstr != NULL) {
3652 reason = PyUnicode_FromWideChar(wstr, errlen);
3653 PyMem_Free(wstr);
3654 } else
3655 errmsg = NULL;
3656 }
3657 if (errmsg == NULL)
3658 reason = PyUnicode_FromString(
3659 "mbstowcs() encountered an invalid multibyte sequence");
3660 if (reason == NULL)
3661 return NULL;
3662
3663 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3664 "locale", str, len,
3665 (Py_ssize_t)error_pos,
3666 (Py_ssize_t)(error_pos+1),
3667 reason);
3668 Py_DECREF(reason);
3669 if (exc != NULL) {
3670 PyCodec_StrictErrors(exc);
3671 Py_XDECREF(exc);
3672 }
3673 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003674}
3675
3676PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003677PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003678{
3679 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003680 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003681}
3682
3683
3684PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003685PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003686 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003687 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3688}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689
Christian Heimes5894ba72007-11-04 11:43:14 +00003690PyObject*
3691PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3692{
Victor Stinner99b95382011-07-04 14:23:54 +02003693#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003694 return PyUnicode_DecodeMBCS(s, size, NULL);
3695#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003696 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003697#else
Victor Stinner793b5312011-04-27 00:24:21 +02003698 PyInterpreterState *interp = PyThreadState_GET()->interp;
3699 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3700 cannot use it to encode and decode filenames before it is loaded. Load
3701 the Python codec requires to encode at least its own filename. Use the C
3702 version of the locale codec until the codec registry is initialized and
3703 the Python codec is loaded.
3704
3705 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3706 cannot only rely on it: check also interp->fscodec_initialized for
3707 subinterpreters. */
3708 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003709 return PyUnicode_Decode(s, size,
3710 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003711 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712 }
3713 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003714 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003715 }
Victor Stinnerad158722010-10-27 00:25:46 +00003716#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003717}
3718
Martin v. Löwis011e8422009-05-05 04:43:17 +00003719
3720int
Antoine Pitrou13348842012-01-29 18:36:34 +01003721_PyUnicode_HasNULChars(PyObject* s)
3722{
3723 static PyObject *nul = NULL;
3724
3725 if (nul == NULL)
3726 nul = PyUnicode_FromStringAndSize("\0", 1);
3727 if (nul == NULL)
3728 return -1;
3729 return PyUnicode_Contains(s, nul);
3730}
3731
3732
3733int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003734PyUnicode_FSConverter(PyObject* arg, void* addr)
3735{
3736 PyObject *output = NULL;
3737 Py_ssize_t size;
3738 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003739 if (arg == NULL) {
3740 Py_DECREF(*(PyObject**)addr);
3741 return 1;
3742 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003743 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003744 output = arg;
3745 Py_INCREF(output);
3746 }
3747 else {
3748 arg = PyUnicode_FromObject(arg);
3749 if (!arg)
3750 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003751 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003752 Py_DECREF(arg);
3753 if (!output)
3754 return 0;
3755 if (!PyBytes_Check(output)) {
3756 Py_DECREF(output);
3757 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3758 return 0;
3759 }
3760 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003761 size = PyBytes_GET_SIZE(output);
3762 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003763 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003764 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003765 Py_DECREF(output);
3766 return 0;
3767 }
3768 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003769 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770}
3771
3772
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003773int
3774PyUnicode_FSDecoder(PyObject* arg, void* addr)
3775{
3776 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003777 if (arg == NULL) {
3778 Py_DECREF(*(PyObject**)addr);
3779 return 1;
3780 }
3781 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003782 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003784 output = arg;
3785 Py_INCREF(output);
3786 }
3787 else {
3788 arg = PyBytes_FromObject(arg);
3789 if (!arg)
3790 return 0;
3791 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3792 PyBytes_GET_SIZE(arg));
3793 Py_DECREF(arg);
3794 if (!output)
3795 return 0;
3796 if (!PyUnicode_Check(output)) {
3797 Py_DECREF(output);
3798 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3799 return 0;
3800 }
3801 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003802 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003803 Py_DECREF(output);
3804 return 0;
3805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003807 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003808 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3809 Py_DECREF(output);
3810 return 0;
3811 }
3812 *(PyObject**)addr = output;
3813 return Py_CLEANUP_SUPPORTED;
3814}
3815
3816
Martin v. Löwis5b222132007-06-10 09:51:05 +00003817char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003819{
Christian Heimesf3863112007-11-22 07:46:41 +00003820 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003822 if (!PyUnicode_Check(unicode)) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003827 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003829 if (PyUnicode_UTF8(unicode) == NULL) {
3830 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3832 if (bytes == NULL)
3833 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3835 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 Py_DECREF(bytes);
3837 return NULL;
3838 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3840 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3841 PyBytes_AS_STRING(bytes),
3842 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 Py_DECREF(bytes);
3844 }
3845
3846 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003847 *psize = PyUnicode_UTF8_LENGTH(unicode);
3848 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003849}
3850
3851char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3855}
3856
3857#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003858static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859#endif
3860
3861
3862Py_UNICODE *
3863PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 const unsigned char *one_byte;
3866#if SIZEOF_WCHAR_T == 4
3867 const Py_UCS2 *two_bytes;
3868#else
3869 const Py_UCS4 *four_bytes;
3870 const Py_UCS4 *ucs4_end;
3871 Py_ssize_t num_surrogates;
3872#endif
3873 wchar_t *w;
3874 wchar_t *wchar_end;
3875
3876 if (!PyUnicode_Check(unicode)) {
3877 PyErr_BadArgument();
3878 return NULL;
3879 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 assert(_PyUnicode_KIND(unicode) != 0);
3883 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884
3885#ifdef Py_DEBUG
3886 ++unicode_as_unicode_calls;
3887#endif
3888
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3892 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 num_surrogates = 0;
3894
3895 for (; four_bytes < ucs4_end; ++four_bytes) {
3896 if (*four_bytes > 0xFFFF)
3897 ++num_surrogates;
3898 }
3899
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3901 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3902 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 PyErr_NoMemory();
3904 return NULL;
3905 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003906 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003908 w = _PyUnicode_WSTR(unicode);
3909 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3910 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3912 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003913 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003915 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3916 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 }
3918 else
3919 *w = *four_bytes;
3920
3921 if (w > wchar_end) {
3922 assert(0 && "Miscalculated string end");
3923 }
3924 }
3925 *w = 0;
3926#else
3927 /* sizeof(wchar_t) == 4 */
3928 Py_FatalError("Impossible unicode object state, wstr and str "
3929 "should share memory already.");
3930 return NULL;
3931#endif
3932 }
3933 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3935 (_PyUnicode_LENGTH(unicode) + 1));
3936 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 PyErr_NoMemory();
3938 return NULL;
3939 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003940 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3941 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3942 w = _PyUnicode_WSTR(unicode);
3943 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3946 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 for (; w < wchar_end; ++one_byte, ++w)
3948 *w = *one_byte;
3949 /* null-terminate the wstr */
3950 *w = 0;
3951 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003952 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003954 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 for (; w < wchar_end; ++two_bytes, ++w)
3956 *w = *two_bytes;
3957 /* null-terminate the wstr */
3958 *w = 0;
3959#else
3960 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 PyObject_FREE(_PyUnicode_WSTR(unicode));
3962 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 Py_FatalError("Impossible unicode object state, wstr "
3964 "and str should share memory already.");
3965 return NULL;
3966#endif
3967 }
3968 else {
3969 assert(0 && "This should never happen.");
3970 }
3971 }
3972 }
3973 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003974 *size = PyUnicode_WSTR_LENGTH(unicode);
3975 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003976}
3977
Alexander Belopolsky40018472011-02-26 01:02:56 +00003978Py_UNICODE *
3979PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982}
3983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984
Alexander Belopolsky40018472011-02-26 01:02:56 +00003985Py_ssize_t
3986PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987{
3988 if (!PyUnicode_Check(unicode)) {
3989 PyErr_BadArgument();
3990 goto onError;
3991 }
3992 return PyUnicode_GET_SIZE(unicode);
3993
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 return -1;
3996}
3997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998Py_ssize_t
3999PyUnicode_GetLength(PyObject *unicode)
4000{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004001 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 PyErr_BadArgument();
4003 return -1;
4004 }
4005
4006 return PyUnicode_GET_LENGTH(unicode);
4007}
4008
4009Py_UCS4
4010PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4011{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004012 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4013 PyErr_BadArgument();
4014 return (Py_UCS4)-1;
4015 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004016 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004017 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 return (Py_UCS4)-1;
4019 }
4020 return PyUnicode_READ_CHAR(unicode, index);
4021}
4022
4023int
4024PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4025{
4026 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004027 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 return -1;
4029 }
Victor Stinner488fa492011-12-12 00:01:39 +01004030 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004031 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004032 PyErr_SetString(PyExc_IndexError, "string index out of range");
4033 return -1;
4034 }
Victor Stinner488fa492011-12-12 00:01:39 +01004035 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004036 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004037 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4038 PyErr_SetString(PyExc_ValueError, "character out of range");
4039 return -1;
4040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4042 index, ch);
4043 return 0;
4044}
4045
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4051
Victor Stinner554f3f02010-06-16 23:33:54 +00004052/* create or adjust a UnicodeDecodeError */
4053static void
4054make_decode_exception(PyObject **exceptionObject,
4055 const char *encoding,
4056 const char *input, Py_ssize_t length,
4057 Py_ssize_t startpos, Py_ssize_t endpos,
4058 const char *reason)
4059{
4060 if (*exceptionObject == NULL) {
4061 *exceptionObject = PyUnicodeDecodeError_Create(
4062 encoding, input, length, startpos, endpos, reason);
4063 }
4064 else {
4065 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4066 goto onError;
4067 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4068 goto onError;
4069 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4070 goto onError;
4071 }
4072 return;
4073
4074onError:
4075 Py_DECREF(*exceptionObject);
4076 *exceptionObject = NULL;
4077}
4078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079/* error handling callback helper:
4080 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004081 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 and adjust various state variables.
4083 return 0 on success, -1 on error
4084*/
4085
Alexander Belopolsky40018472011-02-26 01:02:56 +00004086static int
4087unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004088 const char *encoding, const char *reason,
4089 const char **input, const char **inend, Py_ssize_t *startinpos,
4090 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004091 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004093 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094
4095 PyObject *restuple = NULL;
4096 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004097 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004098 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004099 Py_ssize_t requiredsize;
4100 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004101 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 int res = -1;
4103
Victor Stinner596a6c42011-11-09 00:02:18 +01004104 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4105 outsize = PyUnicode_GET_LENGTH(*output);
4106 else
4107 outsize = _PyUnicode_WSTR_LENGTH(*output);
4108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 *errorHandler = PyCodec_LookupError(errors);
4111 if (*errorHandler == NULL)
4112 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 }
4114
Victor Stinner554f3f02010-06-16 23:33:54 +00004115 make_decode_exception(exceptionObject,
4116 encoding,
4117 *input, *inend - *input,
4118 *startinpos, *endinpos,
4119 reason);
4120 if (*exceptionObject == NULL)
4121 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122
4123 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4124 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004127 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 }
4130 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004132 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004133 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004134
4135 /* Copy back the bytes variables, which might have been modified by the
4136 callback */
4137 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4138 if (!inputobj)
4139 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004140 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004143 *input = PyBytes_AS_STRING(inputobj);
4144 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004145 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004146 /* we can DECREF safely, as the exception has another reference,
4147 so the object won't go away. */
4148 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004152 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4154 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004155 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156
Victor Stinner596a6c42011-11-09 00:02:18 +01004157 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4158 /* need more space? (at least enough for what we
4159 have+the replacement+the rest of the string (starting
4160 at the new input position), so we won't have to check space
4161 when there are no errors in the rest of the string) */
4162 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4163 requiredsize = *outpos + replen + insize-newpos;
4164 if (requiredsize > outsize) {
4165 if (requiredsize<2*outsize)
4166 requiredsize = 2*outsize;
4167 if (unicode_resize(output, requiredsize) < 0)
4168 goto onError;
4169 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004170 if (unicode_widen(output, *outpos,
4171 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004173 copy_characters(*output, *outpos, repunicode, 0, replen);
4174 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004176 else {
4177 wchar_t *repwstr;
4178 Py_ssize_t repwlen;
4179 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4180 if (repwstr == NULL)
4181 goto onError;
4182 /* need more space? (at least enough for what we
4183 have+the replacement+the rest of the string (starting
4184 at the new input position), so we won't have to check space
4185 when there are no errors in the rest of the string) */
4186 requiredsize = *outpos + repwlen + insize-newpos;
4187 if (requiredsize > outsize) {
4188 if (requiredsize < 2*outsize)
4189 requiredsize = 2*outsize;
4190 if (unicode_resize(output, requiredsize) < 0)
4191 goto onError;
4192 }
4193 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4194 *outpos += repwlen;
4195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 /* we made it! */
4200 res = 0;
4201
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 Py_XDECREF(restuple);
4204 return res;
4205}
4206
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* read-only lookup table, indexed by ASCII code */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so the table is only indexed with
 * 1..127; the nul character is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)                  \
    ((c) < 128 && (c) > 0 &&                                 \
     ((utf7_category[(c)] == 0) ||                           \
      ((directWS) && (utf7_category[(c)] == 2)) ||           \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004287
Alexander Belopolsky40018472011-02-26 01:02:56 +00004288PyObject *
4289PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004290 Py_ssize_t size,
4291 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004293 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4294}
4295
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296/* The decoder. The only state we preserve is our read position,
4297 * i.e. how many characters we have consumed. So if we end in the
4298 * middle of a shift sequence we have to back off the read position
4299 * and the output to the beginning of the sequence, otherwise we lose
4300 * all the shift state (seen bits, number of bits seen, high
4301 * surrogate). */
4302
Alexander Belopolsky40018472011-02-26 01:02:56 +00004303PyObject *
4304PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004305 Py_ssize_t size,
4306 const char *errors,
4307 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004308{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 Py_ssize_t startinpos;
4311 Py_ssize_t endinpos;
4312 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004314 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315 const char *errmsg = "";
4316 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 unsigned int base64bits = 0;
4319 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004320 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 PyObject *errorHandler = NULL;
4322 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004324 /* Start off assuming it's all ASCII. Widen later as necessary. */
4325 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326 if (!unicode)
4327 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004328 if (size == 0) {
4329 if (consumed)
4330 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004331 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004332 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004334 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 e = s + size;
4336
4337 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004338 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004340 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004341
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 if (inShift) { /* in a base-64 section */
4343 if (IS_BASE64(ch)) { /* consume a base-64 character */
4344 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4345 base64bits += 6;
4346 s++;
4347 if (base64bits >= 16) {
4348 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004349 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 base64bits -= 16;
4351 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4352 if (surrogate) {
4353 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004354 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4355 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004359 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 }
4361 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004362 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4363 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 }
4366 }
Victor Stinner551ac952011-11-29 22:58:13 +01004367 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 /* first surrogate */
4369 surrogate = outCh;
4370 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4373 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 }
4375 }
4376 }
4377 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 inShift = 0;
4379 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004381 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4382 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004383 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 if (base64bits > 0) { /* left-over bits */
4386 if (base64bits >= 6) {
4387 /* We've seen at least one base-64 character */
4388 errmsg = "partial character in shift sequence";
4389 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 else {
4392 /* Some bits remain; they should be zero */
4393 if (base64buffer != 0) {
4394 errmsg = "non-zero padding bits in shift sequence";
4395 goto utf7Error;
4396 }
4397 }
4398 }
4399 if (ch != '-') {
4400 /* '-' is absorbed; other terminating
4401 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004402 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4403 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
4406 }
4407 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 s++; /* consume '+' */
4410 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4413 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 }
4415 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004417 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
4420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4423 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 s++;
4425 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 else {
4427 startinpos = s-starts;
4428 s++;
4429 errmsg = "unexpected special character";
4430 goto utf7Error;
4431 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 endinpos = s-starts;
4435 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 errors, &errorHandler,
4437 "utf7", errmsg,
4438 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004439 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 }
4442
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 /* end of string */
4444
4445 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4446 /* if we're in an inconsistent state, that's an error */
4447 if (surrogate ||
4448 (base64bits >= 6) ||
4449 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 endinpos = size;
4451 if (unicode_decode_call_errorhandler(
4452 errors, &errorHandler,
4453 "utf7", "unterminated shift sequence",
4454 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004455 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 goto onError;
4457 if (s < e)
4458 goto restart;
4459 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461
4462 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004465 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
4468 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004469 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 goto onError;
4475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004478 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 Py_DECREF(unicode);
4484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004498 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 unsigned int base64bits = 0;
4502 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503 char * out;
4504 char * start;
4505
Benjamin Petersonbac79492012-01-14 13:34:47 -05004506 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004507 return NULL;
4508 kind = PyUnicode_KIND(str);
4509 data = PyUnicode_DATA(str);
4510 len = PyUnicode_GET_LENGTH(str);
4511
4512 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515 /* It might be possible to tighten this worst case */
4516 allocated = 8 * len;
4517 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004518 return PyErr_NoMemory();
4519
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 if (v == NULL)
4522 return NULL;
4523
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004524 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004525 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004526 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528 if (inShift) {
4529 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4530 /* shifting out */
4531 if (base64bits) { /* output remaining bits */
4532 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4533 base64buffer = 0;
4534 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
4536 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 /* Characters not in the BASE64 set implicitly unshift the sequence
4538 so no '-' is required, except if the character is itself a '-' */
4539 if (IS_BASE64(ch) || ch == '-') {
4540 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 *out++ = (char) ch;
4543 }
4544 else {
4545 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004546 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 else { /* not in a shift sequence */
4549 if (ch == '+') {
4550 *out++ = '+';
4551 *out++ = '-';
4552 }
4553 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4554 *out++ = (char) ch;
4555 }
4556 else {
4557 *out++ = '+';
4558 inShift = 1;
4559 goto encode_char;
4560 }
4561 }
4562 continue;
4563encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004565 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004566
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 /* code first surrogate */
4568 base64bits += 16;
4569 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4570 while (base64bits >= 6) {
4571 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4572 base64bits -= 6;
4573 }
4574 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004575 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 base64bits += 16;
4578 base64buffer = (base64buffer << 16) | ch;
4579 while (base64bits >= 6) {
4580 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4581 base64bits -= 6;
4582 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004583 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (base64bits)
4585 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4586 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004588 if (_PyBytes_Resize(&v, out - start) < 0)
4589 return NULL;
4590 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004592PyObject *
4593PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4594 Py_ssize_t size,
4595 int base64SetO,
4596 int base64WhiteSpace,
4597 const char *errors)
4598{
4599 PyObject *result;
4600 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4601 if (tmp == NULL)
4602 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004603 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004604 base64WhiteSpace, errors);
4605 Py_DECREF(tmp);
4606 return result;
4607}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609#undef IS_BASE64
4610#undef FROM_BASE64
4611#undef TO_BASE64
4612#undef DECODE_DIRECT
4613#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615/* --- UTF-8 Codec -------------------------------------------------------- */
4616
/* Length of the UTF-8 sequence introduced by each possible first byte.
   A value of zero marks a byte that cannot start a sequence (continuation
   bytes 80-BF, the overlong prefixes C0-C1 and F5-FF).  See RFC 3629. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4638
Alexander Belopolsky40018472011-02-26 01:02:56 +00004639PyObject *
4640PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004641 Py_ssize_t size,
4642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643{
Walter Dörwald69652032004-09-07 20:24:22 +00004644 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4645}
4646
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004647#include "stringlib/ucs1lib.h"
4648#include "stringlib/codecs.h"
4649#include "stringlib/undef.h"
4650
4651#include "stringlib/ucs2lib.h"
4652#include "stringlib/codecs.h"
4653#include "stringlib/undef.h"
4654
4655#include "stringlib/ucs4lib.h"
4656#include "stringlib/codecs.h"
4657#include "stringlib/undef.h"
4658
Antoine Pitrouab868312009-01-10 15:40:25 +00004659/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4660#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4661
4662/* Mask to quickly check whether a C 'long' contains a
4663 non-ASCII, UTF8-encoded char. */
4664#if (SIZEOF_LONG == 8)
4665# define ASCII_CHAR_MASK 0x8080808080808080L
4666#elif (SIZEOF_LONG == 4)
4667# define ASCII_CHAR_MASK 0x80808080L
4668#else
4669# error C 'long' size should be either 4 or 8!
4670#endif
4671
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004672/* Scans a UTF-8 string and returns the maximum character to be expected
4673 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004675 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 */
4678static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004679utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 const unsigned char *end = p + string_size;
4683 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004684
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004685 assert(unicode_size != NULL);
4686
4687 /* By having a cascade of independent loops which fallback onto each
4688 other, we minimize the amount of work done in the average loop
4689 iteration, and we also maximize the CPU's ability to predict
4690 branches correctly (because a given condition will have always the
4691 same boolean outcome except perhaps in the last iteration of the
4692 corresponding loop).
4693 In the general case this brings us rather close to decoding
4694 performance pre-PEP 393, despite the two-pass decoding.
4695
4696 Note that the pure ASCII loop is not duplicated once a non-ASCII
4697 character has been encountered. It is actually a pessimization (by
4698 a significant factor) to use this loop on text with many non-ASCII
4699 characters, and it is important to avoid bad performance on valid
4700 utf-8 data (invalid utf-8 being a different can of worms).
4701 */
4702
4703 /* ASCII */
4704 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705 /* Only check value if it's not a ASCII char... */
4706 if (*p < 0x80) {
4707 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4708 an explanation. */
4709 if (!((size_t) p & LONG_PTR_MASK)) {
4710 /* Help register allocation */
4711 register const unsigned char *_p = p;
4712 while (_p < aligned_end) {
4713 unsigned long value = *(unsigned long *) _p;
4714 if (value & ASCII_CHAR_MASK)
4715 break;
4716 _p += SIZEOF_LONG;
4717 char_count += SIZEOF_LONG;
4718 }
4719 p = _p;
4720 if (p == end)
4721 break;
4722 }
4723 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004724 if (*p < 0x80)
4725 ++char_count;
4726 else
4727 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004729 *unicode_size = char_count;
4730 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004732_ucs1loop:
4733 for (; p < end; ++p) {
4734 if (*p < 0xc4)
4735 char_count += ((*p & 0xc0) != 0x80);
4736 else
4737 goto _ucs2loop;
4738 }
4739 *unicode_size = char_count;
4740 return 255;
4741
4742_ucs2loop:
4743 for (; p < end; ++p) {
4744 if (*p < 0xf0)
4745 char_count += ((*p & 0xc0) != 0x80);
4746 else
4747 goto _ucs4loop;
4748 }
4749 *unicode_size = char_count;
4750 return 65535;
4751
4752_ucs4loop:
4753 for (; p < end; ++p) {
4754 char_count += ((*p & 0xc0) != 0x80);
4755 }
4756 *unicode_size = char_count;
4757 return 65537;
4758}
4759
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004760/* Write `value` at position `index` of the string `unicode`.
   If `index` is greater than the current length, the string is first resized
   to index + index/8 characters (i.e. it is overallocated), so the result
   needs to be shrunk at the end.  The character is then stored with
   unicode_putchar().  Either step jumps to the `onError` label on failure.
   Implicit parameters: the variable `unicode` and the label `onError`.
*/
Victor Stinner785938e2011-12-11 20:09:03 +01004764#define WRITE_MAYBE_FAIL(index, value) \
4765 do { \
4766 Py_ssize_t pos = index; \
4767 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4768 unicode_resize(&unicode, pos + pos/8) < 0) \
4769 goto onError; \
4770 if (unicode_putchar(&unicode, &pos, value) < 0) \
4771 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772 } while (0)
4773
/* UTF-8 decoder with error handling; PyUnicode_DecodeUTF8Stateful() hands
   over to it when its fast decoding attempt reports errors.

   starts:   beginning of the input buffer
   size:     length of the input buffer in bytes
   errors:   error handler name passed to unicode_decode_call_errorhandler()
   consumed: if non-NULL, decoding stops (without error) at a sequence that
             is cut off by the end of the input, and *consumed receives the
             number of input bytes processed
   s:        input position at which decoding resumes
   unicode:  output string being filled; it is released on failure
   i:        output index at which writing resumes

   Returns the finished string, or NULL with an exception set. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004774static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004775decode_utf8_errors(const char *starts,
4776 Py_ssize_t size,
4777 const char *errors,
4778 Py_ssize_t *consumed,
4779 const char *s,
4780 PyObject *unicode,
4781 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004782{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004784 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 Py_ssize_t startinpos;
4786 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004787 const char *e = starts + size;
4788 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004789 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 PyObject *errorHandler = NULL;
4791 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004792
Antoine Pitrouab868312009-01-10 15:40:25 +00004793 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
4795 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004796 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
4798 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004799 /* Fast path for runs of ASCII characters. Given that common UTF-8
4800 input will consist of an overwhelming majority of ASCII
4801 characters, we try to optimize for this case by checking
4802 as many characters as a C 'long' can contain.
4803 First, check if we can do an aligned read, as most CPUs have
4804 a penalty for unaligned reads.
4805 */
4806 if (!((size_t) s & LONG_PTR_MASK)) {
4807 /* Help register allocation */
4808 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004810 while (_s < aligned_end) {
4811 /* Read a whole long at a time (either 4 or 8 bytes),
4812 and do a fast unrolled copy if it only contains ASCII
4813 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814 unsigned long value = *(unsigned long *) _s;
4815 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004816 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004817 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4818 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4819 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4820 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004821#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004822 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4823 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4824 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4825 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004826#endif
4827 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004829 }
4830 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004832 if (s == e)
4833 break;
4834 ch = (unsigned char)*s;
4835 }
4836 }
4837
4838 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004839 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 s++;
4841 continue;
4842 }
4843
4844 n = utf8_code_length[ch];
4845
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004846 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 if (consumed)
4848 break;
4849 else {
4850 errmsg = "unexpected end of data";
4851 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004852 endinpos = startinpos+1;
4853 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4854 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 goto utf8Error;
4856 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858
4859 switch (n) {
4860
4861 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004862 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004863 startinpos = s-starts;
4864 endinpos = startinpos+1;
4865 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
4867 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004868 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 startinpos = s-starts;
4870 endinpos = startinpos+1;
4871 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
4873 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004874 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004875 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004877 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004878 goto utf8Error;
4879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004881 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004882 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 break;
4884
4885 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004886 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4887 will result in surrogates in range d800-dfff. Surrogates are
4888 not valid UTF-8 so they are rejected.
4889 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4890 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004891 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004892 (s[2] & 0xc0) != 0x80 ||
4893 ((unsigned char)s[0] == 0xE0 &&
4894 (unsigned char)s[1] < 0xA0) ||
4895 ((unsigned char)s[0] == 0xED &&
4896 (unsigned char)s[1] > 0x9F)) {
4897 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004899 endinpos = startinpos + 1;
4900
4901 /* if s[1] first two bits are 1 and 0, then the invalid
4902 continuation byte is s[2], so increment endinpos by 1,
4903 if not, s[1] is invalid and endinpos doesn't need to
4904 be incremented. */
4905 if ((s[1] & 0xC0) == 0x80)
4906 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 goto utf8Error;
4908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004910 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004911 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004912 break;
4913
4914 case 4:
4915 if ((s[1] & 0xc0) != 0x80 ||
4916 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004917 (s[3] & 0xc0) != 0x80 ||
4918 ((unsigned char)s[0] == 0xF0 &&
4919 (unsigned char)s[1] < 0x90) ||
4920 ((unsigned char)s[0] == 0xF4 &&
4921 (unsigned char)s[1] > 0x8F)) {
4922 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004924 endinpos = startinpos + 1;
4925 if ((s[1] & 0xC0) == 0x80) {
4926 endinpos++;
4927 if ((s[2] & 0xC0) == 0x80)
4928 endinpos++;
4929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 goto utf8Error;
4931 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004932 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004933 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004934 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004935
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004936 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 }
4939 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004941
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 if (unicode_decode_call_errorhandler(
4944 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004945 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004947 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949 /* Recompute aligned_end: the error handler call above receives
   &starts and &e, so the end of the input may have changed. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 }
Walter Dörwald69652032004-09-07 20:24:22 +00004953 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004956 /* Resize the result to the i characters actually written
   (WRITE_MAYBE_FAIL overallocates), then let unicode_adjust_maxchar()
   refit it -- presumably to the narrowest suitable representation;
   that call can leave `unicode` NULL, which is checked below. */
Victor Stinner785938e2011-12-11 20:09:03 +01004958 if (unicode_resize(&unicode, i) < 0)
4959 goto onError;
4960 unicode_adjust_maxchar(&unicode);
4961 if (unicode == NULL)
4962 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 Py_XDECREF(errorHandler);
4965 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004966 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004967 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 Py_XDECREF(errorHandler);
4971 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004972 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 return NULL;
4974}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004975#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004976
Victor Stinner785938e2011-12-11 20:09:03 +01004977PyObject *
4978PyUnicode_DecodeUTF8Stateful(const char *s,
4979 Py_ssize_t size,
4980 const char *errors,
4981 Py_ssize_t *consumed)
4982{
4983 Py_UCS4 maxchar = 0;
4984 Py_ssize_t unicode_size;
4985 int has_errors = 0;
4986 PyObject *unicode;
4987 int kind;
4988 void *data;
4989 const char *starts = s;
4990 const char *e;
4991 Py_ssize_t i;
4992
4993 if (size == 0) {
4994 if (consumed)
4995 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004996 Py_INCREF(unicode_empty);
4997 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004998 }
4999
Victor Stinnera1d12bb2011-12-11 21:53:09 +01005000 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01005001
5002 /* When the string is ASCII only, just use memcpy and return.
5003 unicode_size may be != size if there is an incomplete UTF-8
5004 sequence at the end of the ASCII block. */
5005 if (maxchar < 128 && size == unicode_size) {
5006 if (consumed)
5007 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01005008 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01005009 }
5010
5011 unicode = PyUnicode_New(unicode_size, maxchar);
5012 if (!unicode)
5013 return NULL;
5014 kind = PyUnicode_KIND(unicode);
5015 data = PyUnicode_DATA(unicode);
5016
5017 /* Unpack UTF-8 encoded data */
5018 i = 0;
5019 e = starts + size;
5020 switch (kind) {
5021 case PyUnicode_1BYTE_KIND:
5022 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
5023 break;
5024 case PyUnicode_2BYTE_KIND:
5025 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
5026 break;
5027 case PyUnicode_4BYTE_KIND:
5028 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
5029 break;
5030 }
5031 if (!has_errors) {
5032 /* Ensure the unicode size calculation was correct */
5033 assert(i == unicode_size);
5034 assert(s == e);
5035 if (consumed)
5036 *consumed = size;
5037 return unicode;
5038 }
5039
5040 /* In case of errors, maxchar and size computation might be incorrect;
5041 code below refits and resizes as necessary. */
5042 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5043}
5044
#ifdef __APPLE__

/* Stripped-down UTF-8 decoder with the surrogateescape error handler built
   in, used to decode the command line arguments on Mac OS X.

   Every byte that does not belong to a valid UTF-8 sequence is emitted as
   the code unit 0xDC00 + byte.  The result is a NUL-terminated wchar_t
   buffer obtained from PyMem_Malloc(); NULL is returned when the buffer
   size would overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *result, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 code units are always sufficient. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (result == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    end = s + size;
    while (s < end) {
        /* Unsigned view of the current sequence. */
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 lead = b[0];
        Py_UCS4 ch;
        int seqlen;

        /* ASCII is copied through unchanged. */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (s + seqlen > end) {
            /* sequence would run past the end of the input */
            goto escape_byte;
        }

        if (seqlen == 0 || seqlen == 1) {
            /* not a valid lead byte */
            goto escape_byte;
        }
        else if (seqlen == 2) {
            if ((b[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[0] == 0xE0 && b[1] < 0xA0) ||
                (b[0] == 0xED && b[1] > 0x9F)) {
                goto escape_byte;
            }
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80 ||
                (b[0] == 0xF0 && b[1] < 0x90) ||
                (b[0] == 0xF4 && b[1] > 0x8F)) {
                goto escape_byte;
            }
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        /* Any other length emits nothing and is simply skipped. */
        s += seqlen;
        continue;

      escape_byte:
        /* Smuggle the offending byte through as a lone surrogate. */
        *out++ = 0xDC00 + lead;
        s++;
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153/* Primary internal function which creates utf8 encoded bytes objects.
5154
5155 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005156 and allocate exactly as much space needed at the end. Else allocate the
5157 maximum possible needed (4 result bytes per Unicode character), and return
5158 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005159*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005160PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Victor Stinner6099a032011-12-18 14:22:26 +01005163 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 void *data;
5165 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167 if (!PyUnicode_Check(unicode)) {
5168 PyErr_BadArgument();
5169 return NULL;
5170 }
5171
5172 if (PyUnicode_READY(unicode) == -1)
5173 return NULL;
5174
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005175 if (PyUnicode_UTF8(unicode))
5176 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5177 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178
5179 kind = PyUnicode_KIND(unicode);
5180 data = PyUnicode_DATA(unicode);
5181 size = PyUnicode_GET_LENGTH(unicode);
5182
Benjamin Petersonead6b532011-12-20 17:23:42 -06005183 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005184 default:
5185 assert(0);
5186 case PyUnicode_1BYTE_KIND:
5187 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5188 assert(!PyUnicode_IS_ASCII(unicode));
5189 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5190 case PyUnicode_2BYTE_KIND:
5191 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5192 case PyUnicode_4BYTE_KIND:
5193 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195}
5196
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005198PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5199 Py_ssize_t size,
5200 const char *errors)
5201{
5202 PyObject *v, *unicode;
5203
5204 unicode = PyUnicode_FromUnicode(s, size);
5205 if (unicode == NULL)
5206 return NULL;
5207 v = _PyUnicode_AsUTF8String(unicode, errors);
5208 Py_DECREF(unicode);
5209 return v;
5210}
5211
5212PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005213PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005215 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216}
5217
Walter Dörwald41980ca2007-08-16 21:55:45 +00005218/* --- UTF-32 Codec ------------------------------------------------------- */
5219
5220PyObject *
5221PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 const char *errors,
5224 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
5226 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5227}
5228
5229PyObject *
5230PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_ssize_t size,
5232 const char *errors,
5233 int *byteorder,
5234 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235{
5236 const char *starts = s;
5237 Py_ssize_t startinpos;
5238 Py_ssize_t endinpos;
5239 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005240 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005241 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 int bo = 0; /* assume native ordering by default */
5243 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 /* Offsets from q for retrieving bytes in the right order. */
5245#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5246 int iorder[] = {0, 1, 2, 3};
5247#else
5248 int iorder[] = {3, 2, 1, 0};
5249#endif
5250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005252
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253 q = (unsigned char *)s;
5254 e = q + size;
5255
5256 if (byteorder)
5257 bo = *byteorder;
5258
5259 /* Check for BOM marks (U+FEFF) in the input and adjust current
5260 byte order setting accordingly. In native mode, the leading BOM
5261 mark is skipped, in all other modes, it is copied to the output
5262 stream as-is (giving a ZWNBSP character). */
5263 if (bo == 0) {
5264 if (size >= 4) {
5265 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 if (bom == 0x0000FEFF) {
5269 q += 4;
5270 bo = -1;
5271 }
5272 else if (bom == 0xFFFE0000) {
5273 q += 4;
5274 bo = 1;
5275 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 if (bom == 0x0000FEFF) {
5278 q += 4;
5279 bo = 1;
5280 }
5281 else if (bom == 0xFFFE0000) {
5282 q += 4;
5283 bo = -1;
5284 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005285#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005287 }
5288
5289 if (bo == -1) {
5290 /* force LE */
5291 iorder[0] = 0;
5292 iorder[1] = 1;
5293 iorder[2] = 2;
5294 iorder[3] = 3;
5295 }
5296 else if (bo == 1) {
5297 /* force BE */
5298 iorder[0] = 3;
5299 iorder[1] = 2;
5300 iorder[2] = 1;
5301 iorder[3] = 0;
5302 }
5303
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005304 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005305 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005306 if (!unicode)
5307 return NULL;
5308 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005309 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005310 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005311
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 Py_UCS4 ch;
5314 /* remaining bytes at the end? (size should be divisible by 4) */
5315 if (e-q<4) {
5316 if (consumed)
5317 break;
5318 errmsg = "truncated data";
5319 startinpos = ((const char *)q)-starts;
5320 endinpos = ((const char *)e)-starts;
5321 goto utf32Error;
5322 /* The remaining input chars are ignored if the callback
5323 chooses to skip the input */
5324 }
5325 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5326 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005327
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 if (ch >= 0x110000)
5329 {
5330 errmsg = "codepoint not in range(0x110000)";
5331 startinpos = ((const char *)q)-starts;
5332 endinpos = startinpos+4;
5333 goto utf32Error;
5334 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005335 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5336 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 q += 4;
5338 continue;
5339 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 if (unicode_decode_call_errorhandler(
5341 errors, &errorHandler,
5342 "utf32", errmsg,
5343 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005344 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005346 }
5347
5348 if (byteorder)
5349 *byteorder = bo;
5350
5351 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353
5354 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005355 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356 goto onError;
5357
5358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005360 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363 Py_DECREF(unicode);
5364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
5366 return NULL;
5367}
5368
5369PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370_PyUnicode_EncodeUTF32(PyObject *str,
5371 const char *errors,
5372 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005373{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005374 int kind;
5375 void *data;
5376 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005377 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380 /* Offsets from p for storing byte pairs in the right order. */
5381#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5382 int iorder[] = {0, 1, 2, 3};
5383#else
5384 int iorder[] = {3, 2, 1, 0};
5385#endif
5386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387#define STORECHAR(CH) \
5388 do { \
5389 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5390 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5391 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5392 p[iorder[0]] = (CH) & 0xff; \
5393 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394 } while(0)
5395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 if (!PyUnicode_Check(str)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005400 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005401 return NULL;
5402 kind = PyUnicode_KIND(str);
5403 data = PyUnicode_DATA(str);
5404 len = PyUnicode_GET_LENGTH(str);
5405
5406 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005407 bytesize = nsize * 4;
5408 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005410 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005411 if (v == NULL)
5412 return NULL;
5413
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005414 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419
5420 if (byteorder == -1) {
5421 /* force LE */
5422 iorder[0] = 0;
5423 iorder[1] = 1;
5424 iorder[2] = 2;
5425 iorder[3] = 3;
5426 }
5427 else if (byteorder == 1) {
5428 /* force BE */
5429 iorder[0] = 3;
5430 iorder[1] = 2;
5431 iorder[2] = 1;
5432 iorder[3] = 0;
5433 }
5434
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005435 for (i = 0; i < len; i++)
5436 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005437
5438 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005439 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005440#undef STORECHAR
5441}
5442
Alexander Belopolsky40018472011-02-26 01:02:56 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5445 Py_ssize_t size,
5446 const char *errors,
5447 int byteorder)
5448{
5449 PyObject *result;
5450 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5451 if (tmp == NULL)
5452 return NULL;
5453 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5454 Py_DECREF(tmp);
5455 return result;
5456}
5457
5458PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005459PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005460{
Victor Stinnerb960b342011-11-20 19:12:52 +01005461 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005462}
5463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464/* --- UTF-16 Codec ------------------------------------------------------- */
5465
Tim Peters772747b2001-08-09 22:21:55 +00005466PyObject *
5467PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t size,
5469 const char *errors,
5470 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Walter Dörwald69652032004-09-07 20:24:22 +00005472 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5473}
5474
Antoine Pitrouab868312009-01-10 15:40:25 +00005475/* Two masks for fast checking of whether a C 'long' may contain
5476 UTF16-encoded surrogate characters. This is an efficient heuristic,
5477 assuming that non-surrogate characters with a code point >= 0x8000 are
5478 rare in most input.
5479 FAST_CHAR_MASK is used when the input is in native byte ordering,
5480 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005481*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005482#if (SIZEOF_LONG == 8)
5483# define FAST_CHAR_MASK 0x8000800080008000L
5484# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005485# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005486#elif (SIZEOF_LONG == 4)
5487# define FAST_CHAR_MASK 0x80008000L
5488# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005489# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005490#else
5491# error C 'long' size should be either 4 or 8!
5492#endif
5493
Walter Dörwald69652032004-09-07 20:24:22 +00005494PyObject *
5495PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 Py_ssize_t size,
5497 const char *errors,
5498 int *byteorder,
5499 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t startinpos;
5503 Py_ssize_t endinpos;
5504 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005505 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005506 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005507 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005508 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005509 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005510 /* Offsets from q for retrieving byte pairs in the right order. */
5511#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5512 int ihi = 1, ilo = 0;
5513#else
5514 int ihi = 0, ilo = 1;
5515#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 PyObject *errorHandler = NULL;
5517 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518
5519 /* Note: size will always be longer than the resulting Unicode
5520 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005521 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 if (!unicode)
5523 return NULL;
5524 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005525 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005526 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
Tim Peters772747b2001-08-09 22:21:55 +00005528 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005529 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
5531 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005532 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005534 /* Check for BOM marks (U+FEFF) in the input and adjust current
5535 byte order setting accordingly. In native mode, the leading BOM
5536 mark is skipped, in all other modes, it is copied to the output
5537 stream as-is (giving a ZWNBSP character). */
5538 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005539 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005540 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 if (bom == 0xFEFF) {
5543 q += 2;
5544 bo = -1;
5545 }
5546 else if (bom == 0xFFFE) {
5547 q += 2;
5548 bo = 1;
5549 }
Tim Petersced69f82003-09-16 20:30:58 +00005550#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 if (bom == 0xFEFF) {
5552 q += 2;
5553 bo = 1;
5554 }
5555 else if (bom == 0xFFFE) {
5556 q += 2;
5557 bo = -1;
5558 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005559#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Tim Peters772747b2001-08-09 22:21:55 +00005563 if (bo == -1) {
5564 /* force LE */
5565 ihi = 1;
5566 ilo = 0;
5567 }
5568 else if (bo == 1) {
5569 /* force BE */
5570 ihi = 0;
5571 ilo = 1;
5572 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005573#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5574 native_ordering = ilo < ihi;
5575#else
5576 native_ordering = ilo > ihi;
5577#endif
Tim Peters772747b2001-08-09 22:21:55 +00005578
Antoine Pitrouab868312009-01-10 15:40:25 +00005579 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005580 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005581 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005582 /* First check for possible aligned read of a C 'long'. Unaligned
5583 reads are more expensive, better to defer to another iteration. */
5584 if (!((size_t) q & LONG_PTR_MASK)) {
5585 /* Fast path for runs of non-surrogate chars. */
5586 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 int kind = PyUnicode_KIND(unicode);
5588 void *data = PyUnicode_DATA(unicode);
5589 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005590 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005591 Py_UCS4 maxch;
5592 if (native_ordering) {
5593 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005594 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005595 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005596 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005597 else {
5598 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005599 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005600 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005601 block = ((block >> 8) & STRIPPED_MASK) |
5602 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005603 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005604 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005605#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005606 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005607 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005608 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005609 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005610 ch = (Py_UCS2)(block >> 48);
Victor Stinnere6abb482012-05-02 01:15:40 +02005611 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005612#else
5613 ch = (Py_UCS2)(block >> 16);
Victor Stinnere6abb482012-05-02 01:15:40 +02005614 maxch = MAX_MAXCHAR(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615#endif
5616 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
Victor Stinner1b487b42012-05-03 12:29:04 +02005617 if (unicode_widen(&unicode, outpos, maxch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005618 goto onError;
5619 kind = PyUnicode_KIND(unicode);
5620 data = PyUnicode_DATA(unicode);
5621 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005622#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5623 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005624#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005625 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5626 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5627 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5628#else
5629 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5630#endif
5631#else
5632#if SIZEOF_LONG == 8
5633 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5634 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5635 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5636#else
5637 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5638#endif
5639 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005640#endif
5641 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005642 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005643 q = _q;
5644 if (q >= e)
5645 break;
5646 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648
Benjamin Peterson14339b62009-01-31 16:36:08 +00005649 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005650
Victor Stinner551ac952011-11-29 22:58:13 +01005651 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5653 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 continue;
5655 }
5656
5657 /* UTF-16 code pair: */
5658 if (q > e) {
5659 errmsg = "unexpected end of data";
5660 startinpos = (((const char *)q) - 2) - starts;
5661 endinpos = ((const char *)e) + 1 - starts;
5662 goto utf16Error;
5663 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005664 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5665 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005667 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005668 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005669 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005670 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 continue;
5672 }
5673 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005674 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 startinpos = (((const char *)q)-4)-starts;
5676 endinpos = startinpos+2;
5677 goto utf16Error;
5678 }
5679
Benjamin Peterson14339b62009-01-31 16:36:08 +00005680 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 errmsg = "illegal encoding";
5682 startinpos = (((const char *)q)-2)-starts;
5683 endinpos = startinpos+2;
5684 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005688 errors,
5689 &errorHandler,
5690 "utf16", errmsg,
5691 &starts,
5692 (const char **)&e,
5693 &startinpos,
5694 &endinpos,
5695 &exc,
5696 (const char **)&q,
5697 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005701 /* remaining byte at the end? (size should be even) */
5702 if (e == q) {
5703 if (!consumed) {
5704 errmsg = "truncated data";
5705 startinpos = ((const char *)q) - starts;
5706 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005707 if (unicode_decode_call_errorhandler(
5708 errors,
5709 &errorHandler,
5710 "utf16", errmsg,
5711 &starts,
5712 (const char **)&e,
5713 &startinpos,
5714 &endinpos,
5715 &exc,
5716 (const char **)&q,
5717 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005719 goto onError;
5720 /* The remaining input chars are ignored if the callback
5721 chooses to skip the input */
5722 }
5723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
5725 if (byteorder)
5726 *byteorder = bo;
5727
Walter Dörwald69652032004-09-07 20:24:22 +00005728 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005730
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005732 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 goto onError;
5734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005737 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
5744}
5745
Antoine Pitrouab868312009-01-10 15:40:25 +00005746#undef FAST_CHAR_MASK
5747#undef SWAPPED_FAST_CHAR_MASK
5748
Tim Peters772747b2001-08-09 22:21:55 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750_PyUnicode_EncodeUTF16(PyObject *str,
5751 const char *errors,
5752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 int kind;
5755 void *data;
5756 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005757 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005758 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005759 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005760 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005761 /* Offsets from p for storing byte pairs in the right order. */
5762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5763 int ihi = 1, ilo = 0;
5764#else
5765 int ihi = 0, ilo = 1;
5766#endif
5767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768#define STORECHAR(CH) \
5769 do { \
5770 p[ihi] = ((CH) >> 8) & 0xff; \
5771 p[ilo] = (CH) & 0xff; \
5772 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005775 if (!PyUnicode_Check(str)) {
5776 PyErr_BadArgument();
5777 return NULL;
5778 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005779 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005780 return NULL;
5781 kind = PyUnicode_KIND(str);
5782 data = PyUnicode_DATA(str);
5783 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005784
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785 pairs = 0;
5786 if (kind == PyUnicode_4BYTE_KIND)
5787 for (i = 0; i < len; i++)
5788 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5789 pairs++;
5790 /* 2 * (len + pairs + (byteorder == 0)) */
5791 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005793 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005794 bytesize = nsize * 2;
5795 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005797 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 if (v == NULL)
5799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005801 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005804 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005805 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005806
5807 if (byteorder == -1) {
5808 /* force LE */
5809 ihi = 1;
5810 ilo = 0;
5811 }
5812 else if (byteorder == 1) {
5813 /* force BE */
5814 ihi = 0;
5815 ilo = 1;
5816 }
5817
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005818 for (i = 0; i < len; i++) {
5819 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5820 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005822 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5823 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 }
Tim Peters772747b2001-08-09 22:21:55 +00005825 STORECHAR(ch);
5826 if (ch2)
5827 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005828 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005829
5830 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005832#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833}
5834
Alexander Belopolsky40018472011-02-26 01:02:56 +00005835PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005836PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5837 Py_ssize_t size,
5838 const char *errors,
5839 int byteorder)
5840{
5841 PyObject *result;
5842 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5843 if (tmp == NULL)
5844 return NULL;
5845 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5846 Py_DECREF(tmp);
5847 return result;
5848}
5849
5850PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005851PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005853 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854}
5855
5856/* --- Unicode Escape Codec ----------------------------------------------- */
5857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005858/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5859 if all the escapes in the string make it still a valid ASCII string.
5860 Returns -1 if any escapes were found which cause the string to
5861 pop out of ASCII range. Otherwise returns the length of the
5862 required buffer to hold the string.
5863 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005864static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005865length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5866{
5867 const unsigned char *p = (const unsigned char *)s;
5868 const unsigned char *end = p + size;
5869 Py_ssize_t length = 0;
5870
5871 if (size < 0)
5872 return -1;
5873
5874 for (; p < end; ++p) {
5875 if (*p > 127) {
5876 /* Non-ASCII */
5877 return -1;
5878 }
5879 else if (*p != '\\') {
5880 /* Normal character */
5881 ++length;
5882 }
5883 else {
5884 /* Backslash-escape, check next char */
5885 ++p;
5886 /* Escape sequence reaches till end of string or
5887 non-ASCII follow-up. */
5888 if (p >= end || *p > 127)
5889 return -1;
5890 switch (*p) {
5891 case '\n':
5892 /* backslash + \n result in zero characters */
5893 break;
5894 case '\\': case '\'': case '\"':
5895 case 'b': case 'f': case 't':
5896 case 'n': case 'r': case 'v': case 'a':
5897 ++length;
5898 break;
5899 case '0': case '1': case '2': case '3':
5900 case '4': case '5': case '6': case '7':
5901 case 'x': case 'u': case 'U': case 'N':
5902 /* these do not guarantee ASCII characters */
5903 return -1;
5904 default:
5905 /* count the backslash + the other character */
5906 length += 2;
5907 }
5908 }
5909 }
5910 return length;
5911}
5912
Fredrik Lundh06d12682001-01-24 07:59:11 +00005913static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005914
Alexander Belopolsky40018472011-02-26 01:02:56 +00005915PyObject *
5916PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005917 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005918 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 Py_ssize_t startinpos;
5922 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005923 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005924 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005926 char* message;
5927 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 PyObject *errorHandler = NULL;
5929 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005931 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005932
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005933 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934
5935 /* After length_of_escaped_ascii_string() there are two alternatives,
5936 either the string is pure ASCII with named escapes like \n, etc.
5937 and we determined it's exact size (common case)
5938 or it contains \x, \u, ... escape sequences. then we create a
5939 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005940 if (len >= 0) {
5941 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942 if (!v)
5943 goto onError;
5944 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005945 }
5946 else {
5947 /* Escaped strings will always be longer than the resulting
5948 Unicode string, so we start with size here and then reduce the
5949 length after conversion to the true value.
5950 (but if the error callback returns a long replacement string
5951 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953 if (!v)
5954 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005955 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005956 }
5957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005959 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005960 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005962
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 while (s < end) {
5964 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005965 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 /* The only case in which i == ascii_length is a backslash
5969 followed by a newline. */
5970 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 /* Non-escape characters are interpreted as Unicode ordinals */
5973 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005974 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5975 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 continue;
5977 }
5978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005979 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 /* \ - Escapes */
5981 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 c = *s++;
5983 if (s > end)
5984 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005986 /* The only case in which i == ascii_length is a backslash
5987 followed by a newline. */
5988 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005990 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005993#define WRITECHAR(ch) \
5994 do { \
5995 if (unicode_putchar(&v, &i, ch) < 0) \
5996 goto onError; \
5997 }while(0)
5998
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006000 case '\\': WRITECHAR('\\'); break;
6001 case '\'': WRITECHAR('\''); break;
6002 case '\"': WRITECHAR('\"'); break;
6003 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006005 case 'f': WRITECHAR('\014'); break;
6006 case 't': WRITECHAR('\t'); break;
6007 case 'n': WRITECHAR('\n'); break;
6008 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006009 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006010 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 case '0': case '1': case '2': case '3':
6016 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006017 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006018 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006019 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006020 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006021 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006023 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 break;
6025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* hex escapes */
6027 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006029 digits = 2;
6030 message = "truncated \\xXX escape";
6031 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006035 digits = 4;
6036 message = "truncated \\uXXXX escape";
6037 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006040 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 digits = 8;
6042 message = "truncated \\UXXXXXXXX escape";
6043 hexescape:
6044 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 if (s+digits>end) {
6046 endinpos = size;
6047 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 errors, &errorHandler,
6049 "unicodeescape", "end of string in escape sequence",
6050 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006051 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 goto onError;
6053 goto nextByte;
6054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006055 for (j = 0; j < digits; ++j) {
6056 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006057 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006058 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 errors, &errorHandler,
6061 "unicodeescape", message,
6062 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006063 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006064 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006065 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006067 }
6068 chr = (chr<<4) & ~0xF;
6069 if (c >= '0' && c <= '9')
6070 chr += c - '0';
6071 else if (c >= 'a' && c <= 'f')
6072 chr += 10 + c - 'a';
6073 else
6074 chr += 10 + c - 'A';
6075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006076 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006077 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 /* _decoding_error will have already written into the
6079 target buffer. */
6080 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006082 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006083 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006084 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006085 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 errors, &errorHandler,
6089 "unicodeescape", "illegal Unicode character",
6090 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006092 goto onError;
6093 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 break;
6095
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006097 case 'N':
6098 message = "malformed \\N character escape";
6099 if (ucnhash_CAPI == NULL) {
6100 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006101 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6102 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006103 if (ucnhash_CAPI == NULL)
6104 goto ucnhashError;
6105 }
6106 if (*s == '{') {
6107 const char *start = s+1;
6108 /* look for the closing brace */
6109 while (*s != '}' && s < end)
6110 s++;
6111 if (s > start && s < end && *s == '}') {
6112 /* found a name. look it up in the unicode database */
6113 message = "unknown Unicode character name";
6114 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006115 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006116 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006117 goto store;
6118 }
6119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 errors, &errorHandler,
6123 "unicodeescape", message,
6124 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006125 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006126 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006127 break;
6128
6129 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006130 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 message = "\\ at end of string";
6132 s--;
6133 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 errors, &errorHandler,
6136 "unicodeescape", message,
6137 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006139 goto onError;
6140 }
6141 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006142 WRITECHAR('\\');
6143 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006144 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006145 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006150#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006151
Victor Stinner16e6a802011-12-12 13:24:15 +01006152 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006153 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006154 Py_XDECREF(errorHandler);
6155 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006156 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006157
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006159 PyErr_SetString(
6160 PyExc_UnicodeError,
6161 "\\N escapes not supported (can't load unicodedata module)"
6162 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006163 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 Py_XDECREF(errorHandler);
6165 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006166 return NULL;
6167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 Py_XDECREF(errorHandler);
6171 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return NULL;
6173}
6174
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   The result is not enclosed in quotes.

*/
6181
Alexander Belopolsky40018472011-02-26 01:02:56 +00006182PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 int kind;
6189 void *data;
6190 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
Thomas Wouters89f507f2006-12-13 04:49:30 +00006192 /* Initial allocation is based on the longest-possible unichr
6193 escape.
6194
6195 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6196 unichr, so in this case it's the longest unichr escape. In
6197 narrow (UTF-16) builds this is five chars per source unichr
6198 since there are two unichrs in the surrogate pair, so in narrow
6199 (UTF-16) builds it's not the longest unichr escape.
6200
6201 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6202 so in the narrow (UTF-16) build case it's the longest unichr
6203 escape.
6204 */
6205
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 if (!PyUnicode_Check(unicode)) {
6207 PyErr_BadArgument();
6208 return NULL;
6209 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006210 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 return NULL;
6212 len = PyUnicode_GET_LENGTH(unicode);
6213 kind = PyUnicode_KIND(unicode);
6214 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006215 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6217 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6218 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6219 }
6220
6221 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006222 return PyBytes_FromStringAndSize(NULL, 0);
6223
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006226
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 if (repr == NULL)
6232 return NULL;
6233
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006234 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006236 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006237 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006238
Walter Dörwald79e913e2007-05-12 11:08:06 +00006239 /* Escape backslashes */
6240 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 *p++ = '\\';
6242 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006243 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006244 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006245
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006246 /* Map 21-bit characters to '\U00xxxxxx' */
6247 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006248 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006249 *p++ = '\\';
6250 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006251 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6258 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006260 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006263 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 *p++ = '\\';
6265 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006266 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6267 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6268 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6269 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006271
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006272 /* Map special whitespace to '\t', \n', '\r' */
6273 else if (ch == '\t') {
6274 *p++ = '\\';
6275 *p++ = 't';
6276 }
6277 else if (ch == '\n') {
6278 *p++ = '\\';
6279 *p++ = 'n';
6280 }
6281 else if (ch == '\r') {
6282 *p++ = '\\';
6283 *p++ = 'r';
6284 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006285
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006286 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006287 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006289 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006290 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6291 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006292 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006293
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 /* Copy everything else as-is */
6295 else
6296 *p++ = (char) ch;
6297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006299 assert(p - PyBytes_AS_STRING(repr) > 0);
6300 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6301 return NULL;
6302 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303}
6304
Alexander Belopolsky40018472011-02-26 01:02:56 +00006305PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6307 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309 PyObject *result;
6310 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6311 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006313 result = PyUnicode_AsUnicodeEscapeString(tmp);
6314 Py_DECREF(tmp);
6315 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316}
6317
6318/* --- Raw Unicode Escape Codec ------------------------------------------- */
6319
Alexander Belopolsky40018472011-02-26 01:02:56 +00006320PyObject *
6321PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006322 Py_ssize_t size,
6323 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006326 Py_ssize_t startinpos;
6327 Py_ssize_t endinpos;
6328 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006329 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 const char *end;
6331 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332 PyObject *errorHandler = NULL;
6333 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 /* Escaped strings will always be longer than the resulting
6336 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337 length after conversion to the true value. (But decoding error
6338 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006339 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006343 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006344 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 end = s + size;
6346 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 unsigned char c;
6348 Py_UCS4 x;
6349 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006350 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 /* Non-escape characters are interpreted as Unicode ordinals */
6353 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006354 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6355 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 startinpos = s-starts;
6359
6360 /* \u-escapes are only interpreted iff the number of leading
6361 backslashes if odd */
6362 bs = s;
6363 for (;s < end;) {
6364 if (*s != '\\')
6365 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006366 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6367 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 }
6369 if (((s - bs) & 1) == 0 ||
6370 s >= end ||
6371 (*s != 'u' && *s != 'U')) {
6372 continue;
6373 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006374 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 count = *s=='u' ? 4 : 8;
6376 s++;
6377
6378 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 for (x = 0, i = 0; i < count; ++i, ++s) {
6380 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006381 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 endinpos = s-starts;
6383 if (unicode_decode_call_errorhandler(
6384 errors, &errorHandler,
6385 "rawunicodeescape", "truncated \\uXXXX",
6386 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006387 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 goto onError;
6389 goto nextByte;
6390 }
6391 x = (x<<4) & ~0xF;
6392 if (c >= '0' && c <= '9')
6393 x += c - '0';
6394 else if (c >= 'a' && c <= 'f')
6395 x += 10 + c - 'a';
6396 else
6397 x += 10 + c - 'A';
6398 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006399 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006400 if (unicode_putchar(&v, &outpos, x) < 0)
6401 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006402 } else {
6403 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006404 if (unicode_decode_call_errorhandler(
6405 errors, &errorHandler,
6406 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006408 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 nextByte:
6412 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006414 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 Py_XDECREF(errorHandler);
6417 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006418 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006419
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 Py_XDECREF(errorHandler);
6423 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 return NULL;
6425}
6426
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427
Alexander Belopolsky40018472011-02-26 01:02:56 +00006428PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006429PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006431 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 char *p;
6433 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006434 Py_ssize_t expandsize, pos;
6435 int kind;
6436 void *data;
6437 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006439 if (!PyUnicode_Check(unicode)) {
6440 PyErr_BadArgument();
6441 return NULL;
6442 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006443 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444 return NULL;
6445 kind = PyUnicode_KIND(unicode);
6446 data = PyUnicode_DATA(unicode);
6447 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006448 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6449 bytes, and 1 byte characters 4. */
6450 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006451
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006452 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006454
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 if (repr == NULL)
6457 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006461 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006462 for (pos = 0; pos < len; pos++) {
6463 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* Map 32-bit characters to '\Uxxxxxxxx' */
6465 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006466 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006467 *p++ = '\\';
6468 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006469 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6476 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006477 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 *p++ = '\\';
6481 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006482 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6483 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6484 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6485 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* Copy everything else as-is */
6488 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 *p++ = (char) ch;
6490 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006491
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 assert(p > q);
6493 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006494 return NULL;
6495 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Alexander Belopolsky40018472011-02-26 01:02:56 +00006498PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006499PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6500 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006502 PyObject *result;
6503 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6504 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006505 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006506 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6507 Py_DECREF(tmp);
6508 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509}
6510
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006511/* --- Unicode Internal Codec ------------------------------------------- */
6512
Alexander Belopolsky40018472011-02-26 01:02:56 +00006513PyObject *
6514_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006515 Py_ssize_t size,
6516 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517{
6518 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006519 Py_ssize_t startinpos;
6520 Py_ssize_t endinpos;
6521 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006522 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006523 const char *end;
6524 const char *reason;
6525 PyObject *errorHandler = NULL;
6526 PyObject *exc = NULL;
6527
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006528 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006529 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006530 1))
6531 return NULL;
6532
Thomas Wouters89f507f2006-12-13 04:49:30 +00006533 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006534 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006537 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006538 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006539 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006540 end = s + size;
6541
6542 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006543 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006544 Py_UCS4 ch;
6545 /* We copy the raw representation one byte at a time because the
6546 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[0] = s[0];
6548 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ((char *) &uch)[2] = s[2];
6551 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006552#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 ch = uch;
6554
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 /* We have to sanity check the raw data, otherwise doom looms for
6556 some malformed UCS-4 data. */
6557 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006558#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006559 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006560#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006561 end-s < Py_UNICODE_SIZE
6562 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006564 startinpos = s - starts;
6565 if (end-s < Py_UNICODE_SIZE) {
6566 endinpos = end-starts;
6567 reason = "truncated input";
6568 }
6569 else {
6570 endinpos = s - starts + Py_UNICODE_SIZE;
6571 reason = "illegal code point (> 0x10FFFF)";
6572 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006573 if (unicode_decode_call_errorhandler(
6574 errors, &errorHandler,
6575 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006576 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006577 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006579 continue;
6580 }
6581
6582 s += Py_UNICODE_SIZE;
6583#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006584 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006585 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006586 Py_UNICODE uch2;
6587 ((char *) &uch2)[0] = s[0];
6588 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006589 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006590 {
Victor Stinner551ac952011-11-29 22:58:13 +01006591 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006592 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006593 }
6594 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006595#endif
6596
6597 if (unicode_putchar(&v, &outpos, ch) < 0)
6598 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006599 }
6600
Victor Stinner16e6a802011-12-12 13:24:15 +01006601 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006602 goto onError;
6603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006605 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006606
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006608 Py_XDECREF(v);
6609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
6611 return NULL;
6612}
6613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614/* --- Latin-1 Codec ------------------------------------------------------ */
6615
Alexander Belopolsky40018472011-02-26 01:02:56 +00006616PyObject *
6617PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006618 Py_ssize_t size,
6619 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006622 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626static void
6627make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006628 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006629 PyObject *unicode,
6630 Py_ssize_t startpos, Py_ssize_t endpos,
6631 const char *reason)
6632{
6633 if (*exceptionObject == NULL) {
6634 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006636 encoding, unicode, startpos, endpos, reason);
6637 }
6638 else {
6639 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6640 goto onError;
6641 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6642 goto onError;
6643 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6644 goto onError;
6645 return;
6646 onError:
6647 Py_DECREF(*exceptionObject);
6648 *exceptionObject = NULL;
6649 }
6650}
6651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006653static void
6654raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006655 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006656 PyObject *unicode,
6657 Py_ssize_t startpos, Py_ssize_t endpos,
6658 const char *reason)
6659{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006660 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006661 encoding, unicode, startpos, endpos, reason);
6662 if (*exceptionObject != NULL)
6663 PyCodec_StrictErrors(*exceptionObject);
6664}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665
6666/* error handling callback helper:
6667 build arguments, call the callback and check the arguments,
6668 put the result into newpos and return the replacement string, which
6669 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670static PyObject *
6671unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006672 PyObject **errorHandler,
6673 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006675 Py_ssize_t startpos, Py_ssize_t endpos,
6676 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006678 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 PyObject *restuple;
6681 PyObject *resunicode;
6682
6683 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 }
6688
Benjamin Petersonbac79492012-01-14 13:34:47 -05006689 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006690 return NULL;
6691 len = PyUnicode_GET_LENGTH(unicode);
6692
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006693 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006694 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697
6698 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006699 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006703 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 Py_DECREF(restuple);
6705 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006707 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 &resunicode, newpos)) {
6709 Py_DECREF(restuple);
6710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006712 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6713 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6714 Py_DECREF(restuple);
6715 return NULL;
6716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 *newpos = len + *newpos;
6719 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6721 Py_DECREF(restuple);
6722 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 Py_INCREF(resunicode);
6725 Py_DECREF(restuple);
6726 return resunicode;
6727}
6728
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006731 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006732 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 /* input state */
6735 Py_ssize_t pos=0, size;
6736 int kind;
6737 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 /* output object */
6739 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 /* pointer into the output */
6741 char *str;
6742 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006743 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006744 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6745 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 PyObject *errorHandler = NULL;
6747 PyObject *exc = NULL;
6748 /* the following variable is used for caching string comparisons
6749 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6750 int known_errorHandler = -1;
6751
Benjamin Petersonbac79492012-01-14 13:34:47 -05006752 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 return NULL;
6754 size = PyUnicode_GET_LENGTH(unicode);
6755 kind = PyUnicode_KIND(unicode);
6756 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 /* allocate enough for a simple encoding without
6758 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006759 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006760 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006761 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006763 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006764 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 ressize = size;
6766
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767 while (pos < size) {
6768 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* can we encode this? */
6771 if (c<limit) {
6772 /* no overflow check, because we know that the space is enough */
6773 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006774 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 Py_ssize_t requiredsize;
6778 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 Py_ssize_t collstart = pos;
6782 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 ++collend;
6786 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6787 if (known_errorHandler==-1) {
6788 if ((errors==NULL) || (!strcmp(errors, "strict")))
6789 known_errorHandler = 1;
6790 else if (!strcmp(errors, "replace"))
6791 known_errorHandler = 2;
6792 else if (!strcmp(errors, "ignore"))
6793 known_errorHandler = 3;
6794 else if (!strcmp(errors, "xmlcharrefreplace"))
6795 known_errorHandler = 4;
6796 else
6797 known_errorHandler = 0;
6798 }
6799 switch (known_errorHandler) {
6800 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006801 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 goto onError;
6803 case 2: /* replace */
6804 while (collstart++<collend)
6805 *str++ = '?'; /* fall through */
6806 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006807 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 break;
6809 case 4: /* xmlcharrefreplace */
6810 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 /* determine replacement size */
6812 for (i = collstart, repsize = 0; i < collend; ++i) {
6813 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6814 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006816 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006818 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006820 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006822 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006824 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006826 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006827 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006829 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006831 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 if (requiredsize > ressize) {
6833 if (requiredsize<2*ressize)
6834 requiredsize = 2*ressize;
6835 if (_PyBytes_Resize(&res, requiredsize))
6836 goto onError;
6837 str = PyBytes_AS_STRING(res) + respos;
6838 ressize = requiredsize;
6839 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006840 /* generate replacement */
6841 for (i = collstart; i < collend; ++i) {
6842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 break;
6846 default:
6847 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848 encoding, reason, unicode, &exc,
6849 collstart, collend, &newpos);
6850 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006851 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006853 if (PyBytes_Check(repunicode)) {
6854 /* Directly copy bytes result to output. */
6855 repsize = PyBytes_Size(repunicode);
6856 if (repsize > 1) {
6857 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006858 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006859 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6860 Py_DECREF(repunicode);
6861 goto onError;
6862 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006863 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006864 ressize += repsize-1;
6865 }
6866 memcpy(str, PyBytes_AsString(repunicode), repsize);
6867 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006869 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006870 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006871 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* need more space? (at least enough for what we
6873 have+the replacement+the rest of the string, so
6874 we won't have to check space for encodable characters) */
6875 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 repsize = PyUnicode_GET_LENGTH(repunicode);
6877 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 if (requiredsize > ressize) {
6879 if (requiredsize<2*ressize)
6880 requiredsize = 2*ressize;
6881 if (_PyBytes_Resize(&res, requiredsize)) {
6882 Py_DECREF(repunicode);
6883 goto onError;
6884 }
6885 str = PyBytes_AS_STRING(res) + respos;
6886 ressize = requiredsize;
6887 }
6888 /* check if there is anything unencodable in the replacement
6889 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 for (i = 0; repsize-->0; ++i, ++str) {
6891 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006893 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 Py_DECREF(repunicode);
6896 goto onError;
6897 }
6898 *str = (char)c;
6899 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006902 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006903 }
6904 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006905 /* Resize if we allocated to much */
6906 size = str - PyBytes_AS_STRING(res);
6907 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006908 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006909 if (_PyBytes_Resize(&res, size) < 0)
6910 goto onError;
6911 }
6912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 Py_XDECREF(errorHandler);
6914 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006915 return res;
6916
6917 onError:
6918 Py_XDECREF(res);
6919 Py_XDECREF(errorHandler);
6920 Py_XDECREF(exc);
6921 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922}
6923
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 PyObject *result;
6931 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6932 if (unicode == NULL)
6933 return NULL;
6934 result = unicode_encode_ucs1(unicode, errors, 256);
6935 Py_DECREF(unicode);
6936 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006940_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
6942 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 PyErr_BadArgument();
6944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006946 if (PyUnicode_READY(unicode) == -1)
6947 return NULL;
6948 /* Fast path: if it is a one-byte string, construct
6949 bytes object directly. */
6950 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6951 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6952 PyUnicode_GET_LENGTH(unicode));
6953 /* Non-Latin-1 characters present. Defer to above function to
6954 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006955 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006956}
6957
6958PyObject*
6959PyUnicode_AsLatin1String(PyObject *unicode)
6960{
6961 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
6964/* --- 7-bit ASCII Codec -------------------------------------------------- */
6965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
6967PyUnicode_DecodeASCII(const char *s,
6968 Py_ssize_t size,
6969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006971 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006972 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006973 int kind;
6974 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006975 Py_ssize_t startinpos;
6976 Py_ssize_t endinpos;
6977 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006979 int has_error;
6980 const unsigned char *p = (const unsigned char *)s;
6981 const unsigned char *end = p + size;
6982 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983 PyObject *errorHandler = NULL;
6984 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006985
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006986 if (size == 0) {
6987 Py_INCREF(unicode_empty);
6988 return unicode_empty;
6989 }
6990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006992 if (size == 1 && (unsigned char)s[0] < 128)
6993 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006994
Victor Stinner702c7342011-10-05 13:50:52 +02006995 has_error = 0;
6996 while (p < end && !has_error) {
6997 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6998 an explanation. */
6999 if (!((size_t) p & LONG_PTR_MASK)) {
7000 /* Help register allocation */
7001 register const unsigned char *_p = p;
7002 while (_p < aligned_end) {
7003 unsigned long value = *(unsigned long *) _p;
7004 if (value & ASCII_CHAR_MASK) {
7005 has_error = 1;
7006 break;
7007 }
7008 _p += SIZEOF_LONG;
7009 }
7010 if (_p == end)
7011 break;
7012 if (has_error)
7013 break;
7014 p = _p;
7015 }
7016 if (*p & 0x80) {
7017 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007018 break;
Victor Stinner702c7342011-10-05 13:50:52 +02007019 }
7020 else {
7021 ++p;
7022 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007023 }
Victor Stinner702c7342011-10-05 13:50:52 +02007024 if (!has_error)
7025 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00007026
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007027 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007030 kind = PyUnicode_KIND(v);
7031 data = PyUnicode_DATA(v);
7032 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007033 e = s + size;
7034 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 register unsigned char c = (unsigned char)*s;
7036 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007037 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 ++s;
7039 }
7040 else {
7041 startinpos = s-starts;
7042 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 if (unicode_decode_call_errorhandler(
7044 errors, &errorHandler,
7045 "ascii", "ordinal not in range(128)",
7046 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007047 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007049 kind = PyUnicode_KIND(v);
7050 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007053 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007054 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 Py_XDECREF(errorHandler);
7056 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007057 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007058 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007059
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007062 Py_XDECREF(errorHandler);
7063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return NULL;
7065}
7066
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007067/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007068PyObject *
7069PyUnicode_EncodeASCII(const Py_UNICODE *p,
7070 Py_ssize_t size,
7071 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007073 PyObject *result;
7074 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7075 if (unicode == NULL)
7076 return NULL;
7077 result = unicode_encode_ucs1(unicode, errors, 128);
7078 Py_DECREF(unicode);
7079 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
Alexander Belopolsky40018472011-02-26 01:02:56 +00007082PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007083_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084{
7085 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 PyErr_BadArgument();
7087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007089 if (PyUnicode_READY(unicode) == -1)
7090 return NULL;
7091 /* Fast path: if it is an ASCII-only string, construct bytes object
7092 directly. Else defer to above function to raise the exception. */
7093 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7094 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7095 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007096 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007097}
7098
7099PyObject *
7100PyUnicode_AsASCIIString(PyObject *unicode)
7101{
7102 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103}
7104
Victor Stinner99b95382011-07-04 14:23:54 +02007105#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007106
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007107/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007108
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007109#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110#define NEED_RETRY
7111#endif
7112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113#ifndef WC_ERR_INVALID_CHARS
7114# define WC_ERR_INVALID_CHARS 0x0080
7115#endif
7116
7117static char*
7118code_page_name(UINT code_page, PyObject **obj)
7119{
7120 *obj = NULL;
7121 if (code_page == CP_ACP)
7122 return "mbcs";
7123 if (code_page == CP_UTF7)
7124 return "CP_UTF7";
7125 if (code_page == CP_UTF8)
7126 return "CP_UTF8";
7127
7128 *obj = PyBytes_FromFormat("cp%u", code_page);
7129 if (*obj == NULL)
7130 return NULL;
7131 return PyBytes_AS_STRING(*obj);
7132}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007135is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136{
7137 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 if (!IsDBCSLeadByteEx(code_page, *curr))
7141 return 0;
7142
7143 prev = CharPrevExA(code_page, s, curr, 0);
7144 if (prev == curr)
7145 return 1;
7146 /* FIXME: This code is limited to "true" double-byte encodings,
7147 as it assumes an incomplete character consists of a single
7148 byte. */
7149 if (curr - prev == 2)
7150 return 1;
7151 if (!IsDBCSLeadByteEx(code_page, *prev))
7152 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153 return 0;
7154}
7155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156static DWORD
7157decode_code_page_flags(UINT code_page)
7158{
7159 if (code_page == CP_UTF7) {
7160 /* The CP_UTF7 decoder only supports flags=0 */
7161 return 0;
7162 }
7163 else
7164 return MB_ERR_INVALID_CHARS;
7165}
7166
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 * Decode a byte string from a Windows code page into unicode object in strict
7169 * mode.
7170 *
7171 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7172 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007174static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007175decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007176 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 const char *in,
7178 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179{
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007181 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183
7184 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 assert(insize > 0);
7186 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7187 if (outsize <= 0)
7188 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189
7190 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007192 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007193 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 if (*v == NULL)
7195 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197 }
7198 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007201 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204 }
7205
7206 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7208 if (outsize <= 0)
7209 goto error;
7210 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007211
Victor Stinner3a50e702011-10-18 21:21:00 +02007212error:
7213 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7214 return -2;
7215 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007216 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217}
7218
Victor Stinner3a50e702011-10-18 21:21:00 +02007219/*
7220 * Decode a byte string from a code page into unicode object with an error
7221 * handler.
7222 *
7223 * Returns consumed size if succeed, or raise a WindowsError or
7224 * UnicodeDecodeError exception and returns -1 on error.
7225 */
7226static int
7227decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007228 PyObject **v,
7229 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 const char *errors)
7231{
7232 const char *startin = in;
7233 const char *endin = in + size;
7234 const DWORD flags = decode_code_page_flags(code_page);
7235 /* Ideally, we should get reason from FormatMessage. This is the Windows
7236 2000 English version of the message. */
7237 const char *reason = "No mapping for the Unicode character exists "
7238 "in the target code page.";
7239 /* each step cannot decode more than 1 character, but a character can be
7240 represented as a surrogate pair */
7241 wchar_t buffer[2], *startout, *out;
7242 int insize, outsize;
7243 PyObject *errorHandler = NULL;
7244 PyObject *exc = NULL;
7245 PyObject *encoding_obj = NULL;
7246 char *encoding;
7247 DWORD err;
7248 int ret = -1;
7249
7250 assert(size > 0);
7251
7252 encoding = code_page_name(code_page, &encoding_obj);
7253 if (encoding == NULL)
7254 return -1;
7255
7256 if (errors == NULL || strcmp(errors, "strict") == 0) {
7257 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7258 UnicodeDecodeError. */
7259 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7260 if (exc != NULL) {
7261 PyCodec_StrictErrors(exc);
7262 Py_CLEAR(exc);
7263 }
7264 goto error;
7265 }
7266
7267 if (*v == NULL) {
7268 /* Create unicode object */
7269 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7270 PyErr_NoMemory();
7271 goto error;
7272 }
Victor Stinnerab595942011-12-17 04:59:06 +01007273 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007274 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 if (*v == NULL)
7276 goto error;
7277 startout = PyUnicode_AS_UNICODE(*v);
7278 }
7279 else {
7280 /* Extend unicode object */
7281 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7282 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7283 PyErr_NoMemory();
7284 goto error;
7285 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007286 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 goto error;
7288 startout = PyUnicode_AS_UNICODE(*v) + n;
7289 }
7290
7291 /* Decode the byte string character per character */
7292 out = startout;
7293 while (in < endin)
7294 {
7295 /* Decode a character */
7296 insize = 1;
7297 do
7298 {
7299 outsize = MultiByteToWideChar(code_page, flags,
7300 in, insize,
7301 buffer, Py_ARRAY_LENGTH(buffer));
7302 if (outsize > 0)
7303 break;
7304 err = GetLastError();
7305 if (err != ERROR_NO_UNICODE_TRANSLATION
7306 && err != ERROR_INSUFFICIENT_BUFFER)
7307 {
7308 PyErr_SetFromWindowsErr(0);
7309 goto error;
7310 }
7311 insize++;
7312 }
7313 /* 4=maximum length of a UTF-8 sequence */
7314 while (insize <= 4 && (in + insize) <= endin);
7315
7316 if (outsize <= 0) {
7317 Py_ssize_t startinpos, endinpos, outpos;
7318
7319 startinpos = in - startin;
7320 endinpos = startinpos + 1;
7321 outpos = out - PyUnicode_AS_UNICODE(*v);
7322 if (unicode_decode_call_errorhandler(
7323 errors, &errorHandler,
7324 encoding, reason,
7325 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007326 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 {
7328 goto error;
7329 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007330 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 }
7332 else {
7333 in += insize;
7334 memcpy(out, buffer, outsize * sizeof(wchar_t));
7335 out += outsize;
7336 }
7337 }
7338
7339 /* write a NUL character at the end */
7340 *out = 0;
7341
7342 /* Extend unicode object */
7343 outsize = out - startout;
7344 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007345 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007348
7349error:
7350 Py_XDECREF(encoding_obj);
7351 Py_XDECREF(errorHandler);
7352 Py_XDECREF(exc);
7353 return ret;
7354}
7355
Victor Stinner3a50e702011-10-18 21:21:00 +02007356static PyObject *
7357decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007358 const char *s, Py_ssize_t size,
7359 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360{
Victor Stinner76a31a62011-11-04 00:05:13 +01007361 PyObject *v = NULL;
7362 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 if (code_page < 0) {
7365 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7366 return NULL;
7367 }
7368
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 do
7373 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007375 if (size > INT_MAX) {
7376 chunk_size = INT_MAX;
7377 final = 0;
7378 done = 0;
7379 }
7380 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 {
7383 chunk_size = (int)size;
7384 final = (consumed == NULL);
7385 done = 1;
7386 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387
Victor Stinner76a31a62011-11-04 00:05:13 +01007388 /* Skip trailing lead-byte unless 'final' is set */
7389 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7390 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391
Victor Stinner76a31a62011-11-04 00:05:13 +01007392 if (chunk_size == 0 && done) {
7393 if (v != NULL)
7394 break;
7395 Py_INCREF(unicode_empty);
7396 return unicode_empty;
7397 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398
Victor Stinner76a31a62011-11-04 00:05:13 +01007399
7400 converted = decode_code_page_strict(code_page, &v,
7401 s, chunk_size);
7402 if (converted == -2)
7403 converted = decode_code_page_errors(code_page, &v,
7404 s, chunk_size,
7405 errors);
7406 assert(converted != 0);
7407
7408 if (converted < 0) {
7409 Py_XDECREF(v);
7410 return NULL;
7411 }
7412
7413 if (consumed)
7414 *consumed += converted;
7415
7416 s += converted;
7417 size -= converted;
7418 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007419
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007420 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421}
7422
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007424PyUnicode_DecodeCodePageStateful(int code_page,
7425 const char *s,
7426 Py_ssize_t size,
7427 const char *errors,
7428 Py_ssize_t *consumed)
7429{
7430 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7431}
7432
7433PyObject *
7434PyUnicode_DecodeMBCSStateful(const char *s,
7435 Py_ssize_t size,
7436 const char *errors,
7437 Py_ssize_t *consumed)
7438{
7439 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7440}
7441
7442PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007443PyUnicode_DecodeMBCS(const char *s,
7444 Py_ssize_t size,
7445 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007446{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7448}
7449
Victor Stinner3a50e702011-10-18 21:21:00 +02007450static DWORD
7451encode_code_page_flags(UINT code_page, const char *errors)
7452{
7453 if (code_page == CP_UTF8) {
7454 if (winver.dwMajorVersion >= 6)
7455 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7456 and later */
7457 return WC_ERR_INVALID_CHARS;
7458 else
7459 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7460 return 0;
7461 }
7462 else if (code_page == CP_UTF7) {
7463 /* CP_UTF7 only supports flags=0 */
7464 return 0;
7465 }
7466 else {
7467 if (errors != NULL && strcmp(errors, "replace") == 0)
7468 return 0;
7469 else
7470 return WC_NO_BEST_FIT_CHARS;
7471 }
7472}
7473
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 * Encode a Unicode string to a Windows code page into a byte string in strict
7476 * mode.
7477 *
7478 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7479 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007481static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007482encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007483 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485{
Victor Stinner554f3f02010-06-16 23:33:54 +00007486 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 BOOL *pusedDefaultChar = &usedDefaultChar;
7488 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007489 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007490 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 const DWORD flags = encode_code_page_flags(code_page, NULL);
7493 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 /* Create a substring so that we can get the UTF-16 representation
7495 of just the slice under consideration. */
7496 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007501 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007503 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007504
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 substring = PyUnicode_Substring(unicode, offset, offset+len);
7506 if (substring == NULL)
7507 return -1;
7508 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7509 if (p == NULL) {
7510 Py_DECREF(substring);
7511 return -1;
7512 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007514 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 outsize = WideCharToMultiByte(code_page, flags,
7516 p, size,
7517 NULL, 0,
7518 NULL, pusedDefaultChar);
7519 if (outsize <= 0)
7520 goto error;
7521 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 if (pusedDefaultChar && *pusedDefaultChar) {
7523 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007526
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007530 if (*outbytes == NULL) {
7531 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007533 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535 }
7536 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 const Py_ssize_t n = PyBytes_Size(*outbytes);
7539 if (outsize > PY_SSIZE_T_MAX - n) {
7540 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007544 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7545 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007547 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549 }
7550
7551 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 outsize = WideCharToMultiByte(code_page, flags,
7553 p, size,
7554 out, outsize,
7555 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007556 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 if (outsize <= 0)
7558 goto error;
7559 if (pusedDefaultChar && *pusedDefaultChar)
7560 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007562
Victor Stinner3a50e702011-10-18 21:21:00 +02007563error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007564 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7566 return -2;
7567 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007568 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007569}
7570
Victor Stinner3a50e702011-10-18 21:21:00 +02007571/*
7572 * Encode a Unicode string to a Windows code page into a byte string using a
7573 * error handler.
7574 *
7575 * Returns consumed characters if succeed, or raise a WindowsError and returns
7576 * -1 on other error.
7577 */
7578static int
7579encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007580 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007581 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007582{
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007584 Py_ssize_t pos = unicode_offset;
7585 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 /* Ideally, we should get reason from FormatMessage. This is the Windows
7587 2000 English version of the message. */
7588 const char *reason = "invalid character";
7589 /* 4=maximum length of a UTF-8 sequence */
7590 char buffer[4];
7591 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7592 Py_ssize_t outsize;
7593 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 PyObject *errorHandler = NULL;
7595 PyObject *exc = NULL;
7596 PyObject *encoding_obj = NULL;
7597 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007598 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 PyObject *rep;
7600 int ret = -1;
7601
7602 assert(insize > 0);
7603
7604 encoding = code_page_name(code_page, &encoding_obj);
7605 if (encoding == NULL)
7606 return -1;
7607
7608 if (errors == NULL || strcmp(errors, "strict") == 0) {
7609 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7610 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007611 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 if (exc != NULL) {
7613 PyCodec_StrictErrors(exc);
7614 Py_DECREF(exc);
7615 }
7616 Py_XDECREF(encoding_obj);
7617 return -1;
7618 }
7619
7620 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7621 pusedDefaultChar = &usedDefaultChar;
7622 else
7623 pusedDefaultChar = NULL;
7624
7625 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7626 PyErr_NoMemory();
7627 goto error;
7628 }
7629 outsize = insize * Py_ARRAY_LENGTH(buffer);
7630
7631 if (*outbytes == NULL) {
7632 /* Create string object */
7633 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7634 if (*outbytes == NULL)
7635 goto error;
7636 out = PyBytes_AS_STRING(*outbytes);
7637 }
7638 else {
7639 /* Extend string object */
7640 Py_ssize_t n = PyBytes_Size(*outbytes);
7641 if (n > PY_SSIZE_T_MAX - outsize) {
7642 PyErr_NoMemory();
7643 goto error;
7644 }
7645 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7646 goto error;
7647 out = PyBytes_AS_STRING(*outbytes) + n;
7648 }
7649
7650 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007653 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7654 wchar_t chars[2];
7655 int charsize;
7656 if (ch < 0x10000) {
7657 chars[0] = (wchar_t)ch;
7658 charsize = 1;
7659 }
7660 else {
7661 ch -= 0x10000;
7662 chars[0] = 0xd800 + (ch >> 10);
7663 chars[1] = 0xdc00 + (ch & 0x3ff);
7664 charsize = 2;
7665 }
7666
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007668 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 buffer, Py_ARRAY_LENGTH(buffer),
7670 NULL, pusedDefaultChar);
7671 if (outsize > 0) {
7672 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7673 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 memcpy(out, buffer, outsize);
7676 out += outsize;
7677 continue;
7678 }
7679 }
7680 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7681 PyErr_SetFromWindowsErr(0);
7682 goto error;
7683 }
7684
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 rep = unicode_encode_call_errorhandler(
7686 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007687 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 if (rep == NULL)
7690 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007692
7693 if (PyBytes_Check(rep)) {
7694 outsize = PyBytes_GET_SIZE(rep);
7695 if (outsize != 1) {
7696 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7697 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7698 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7699 Py_DECREF(rep);
7700 goto error;
7701 }
7702 out = PyBytes_AS_STRING(*outbytes) + offset;
7703 }
7704 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7705 out += outsize;
7706 }
7707 else {
7708 Py_ssize_t i;
7709 enum PyUnicode_Kind kind;
7710 void *data;
7711
Benjamin Petersonbac79492012-01-14 13:34:47 -05007712 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 Py_DECREF(rep);
7714 goto error;
7715 }
7716
7717 outsize = PyUnicode_GET_LENGTH(rep);
7718 if (outsize != 1) {
7719 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7720 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7721 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7722 Py_DECREF(rep);
7723 goto error;
7724 }
7725 out = PyBytes_AS_STRING(*outbytes) + offset;
7726 }
7727 kind = PyUnicode_KIND(rep);
7728 data = PyUnicode_DATA(rep);
7729 for (i=0; i < outsize; i++) {
7730 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7731 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007732 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 encoding, unicode,
7734 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 "unable to encode error handler result to ASCII");
7736 Py_DECREF(rep);
7737 goto error;
7738 }
7739 *out = (unsigned char)ch;
7740 out++;
7741 }
7742 }
7743 Py_DECREF(rep);
7744 }
7745 /* write a NUL byte */
7746 *out = 0;
7747 outsize = out - PyBytes_AS_STRING(*outbytes);
7748 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7749 if (_PyBytes_Resize(outbytes, outsize) < 0)
7750 goto error;
7751 ret = 0;
7752
7753error:
7754 Py_XDECREF(encoding_obj);
7755 Py_XDECREF(errorHandler);
7756 Py_XDECREF(exc);
7757 return ret;
7758}
7759
Victor Stinner3a50e702011-10-18 21:21:00 +02007760static PyObject *
7761encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 const char *errors)
7764{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007765 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007766 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007769
Benjamin Petersonbac79492012-01-14 13:34:47 -05007770 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007771 return NULL;
7772 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007773
Victor Stinner3a50e702011-10-18 21:21:00 +02007774 if (code_page < 0) {
7775 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7776 return NULL;
7777 }
7778
Martin v. Löwis3d325192011-11-04 18:23:06 +01007779 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007780 return PyBytes_FromStringAndSize(NULL, 0);
7781
Victor Stinner7581cef2011-11-03 22:32:33 +01007782 offset = 0;
7783 do
7784 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007785#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007786 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007787 chunks. */
7788 if (len > INT_MAX/2) {
7789 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007790 done = 0;
7791 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007792 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007793#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007794 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007795 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007796 done = 1;
7797 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007798
Victor Stinner76a31a62011-11-04 00:05:13 +01007799 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007800 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007801 errors);
7802 if (ret == -2)
7803 ret = encode_code_page_errors(code_page, &outbytes,
7804 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007805 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007806 if (ret < 0) {
7807 Py_XDECREF(outbytes);
7808 return NULL;
7809 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007810
Victor Stinner7581cef2011-11-03 22:32:33 +01007811 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007812 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007813 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007814
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 return outbytes;
7816}
7817
7818PyObject *
7819PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7820 Py_ssize_t size,
7821 const char *errors)
7822{
Victor Stinner7581cef2011-11-03 22:32:33 +01007823 PyObject *unicode, *res;
7824 unicode = PyUnicode_FromUnicode(p, size);
7825 if (unicode == NULL)
7826 return NULL;
7827 res = encode_code_page(CP_ACP, unicode, errors);
7828 Py_DECREF(unicode);
7829 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007830}
7831
7832PyObject *
7833PyUnicode_EncodeCodePage(int code_page,
7834 PyObject *unicode,
7835 const char *errors)
7836{
Victor Stinner7581cef2011-11-03 22:32:33 +01007837 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007838}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007839
Alexander Belopolsky40018472011-02-26 01:02:56 +00007840PyObject *
7841PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007842{
7843 if (!PyUnicode_Check(unicode)) {
7844 PyErr_BadArgument();
7845 return NULL;
7846 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007847 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007848}
7849
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007850#undef NEED_RETRY
7851
Victor Stinner99b95382011-07-04 14:23:54 +02007852#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007853
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854/* --- Character Mapping Codec -------------------------------------------- */
7855
Alexander Belopolsky40018472011-02-26 01:02:56 +00007856PyObject *
7857PyUnicode_DecodeCharmap(const char *s,
7858 Py_ssize_t size,
7859 PyObject *mapping,
7860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007863 Py_ssize_t startinpos;
7864 Py_ssize_t endinpos;
7865 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007867 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007868 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007869 PyObject *errorHandler = NULL;
7870 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007871
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 /* Default to Latin-1 */
7873 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007876 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007880 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007881 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007882 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007883 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007884 Py_ssize_t maplen;
7885 enum PyUnicode_Kind kind;
7886 void *data;
7887 Py_UCS4 x;
7888
Benjamin Petersonbac79492012-01-14 13:34:47 -05007889 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007890 return NULL;
7891
7892 maplen = PyUnicode_GET_LENGTH(mapping);
7893 data = PyUnicode_DATA(mapping);
7894 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 while (s < e) {
7896 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007899 x = PyUnicode_READ(kind, data, ch);
7900 else
7901 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007903 if (x == 0xfffe)
7904 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 startinpos = s-starts;
7907 endinpos = startinpos+1;
7908 if (unicode_decode_call_errorhandler(
7909 errors, &errorHandler,
7910 "charmap", "character maps to <undefined>",
7911 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007912 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 goto onError;
7914 }
7915 continue;
7916 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007917
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007918 if (unicode_putchar(&v, &outpos, x) < 0)
7919 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007921 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007922 }
7923 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 while (s < e) {
7925 unsigned char ch = *s;
7926 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007927
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7929 w = PyLong_FromLong((long)ch);
7930 if (w == NULL)
7931 goto onError;
7932 x = PyObject_GetItem(mapping, w);
7933 Py_DECREF(w);
7934 if (x == NULL) {
7935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7936 /* No mapping found means: mapping is undefined. */
7937 PyErr_Clear();
7938 x = Py_None;
7939 Py_INCREF(x);
7940 } else
7941 goto onError;
7942 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007943
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 /* Apply mapping */
7945 if (PyLong_Check(x)) {
7946 long value = PyLong_AS_LONG(x);
7947 if (value < 0 || value > 65535) {
7948 PyErr_SetString(PyExc_TypeError,
7949 "character mapping must be in range(65536)");
7950 Py_DECREF(x);
7951 goto onError;
7952 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007953 if (unicode_putchar(&v, &outpos, value) < 0)
7954 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 }
7956 else if (x == Py_None) {
7957 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 startinpos = s-starts;
7959 endinpos = startinpos+1;
7960 if (unicode_decode_call_errorhandler(
7961 errors, &errorHandler,
7962 "charmap", "character maps to <undefined>",
7963 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007964 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 Py_DECREF(x);
7966 goto onError;
7967 }
7968 Py_DECREF(x);
7969 continue;
7970 }
7971 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007972 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973
Benjamin Petersonbac79492012-01-14 13:34:47 -05007974 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007975 goto onError;
7976 targetsize = PyUnicode_GET_LENGTH(x);
7977
7978 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007980 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007981 PyUnicode_READ_CHAR(x, 0)) < 0)
7982 goto onError;
7983 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 else if (targetsize > 1) {
7985 /* 1-n mapping */
7986 if (targetsize > extrachars) {
7987 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 Py_ssize_t needed = (targetsize - extrachars) + \
7989 (targetsize << 2);
7990 extrachars += needed;
7991 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007992 if (unicode_resize(&v,
7993 PyUnicode_GET_LENGTH(v) + needed) < 0)
7994 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 Py_DECREF(x);
7996 goto onError;
7997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007999 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01008000 goto onError;
8001 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
8002 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 extrachars -= targetsize;
8004 }
8005 /* 1-0 mapping: skip the character */
8006 }
8007 else {
8008 /* wrong return value */
8009 PyErr_SetString(PyExc_TypeError,
8010 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008011 Py_DECREF(x);
8012 goto onError;
8013 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 Py_DECREF(x);
8015 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 }
Victor Stinner16e6a802011-12-12 13:24:15 +01008018 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01008019 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 Py_XDECREF(errorHandler);
8021 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008022 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00008023
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 Py_XDECREF(errorHandler);
8026 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 Py_XDECREF(v);
8028 return NULL;
8029}
8030
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031/* Charmap encoding: the lookup table */
8032
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 PyObject_HEAD
8035 unsigned char level1[32];
8036 int count2, count3;
8037 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038};
8039
8040static PyObject*
8041encoding_map_size(PyObject *obj, PyObject* args)
8042{
8043 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046}
8047
8048static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 PyDoc_STR("Return the size (in bytes) of this object") },
8051 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052};
8053
8054static void
8055encoding_map_dealloc(PyObject* o)
8056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058}
8059
8060static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 "EncodingMap", /*tp_name*/
8063 sizeof(struct encoding_map), /*tp_basicsize*/
8064 0, /*tp_itemsize*/
8065 /* methods */
8066 encoding_map_dealloc, /*tp_dealloc*/
8067 0, /*tp_print*/
8068 0, /*tp_getattr*/
8069 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008070 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 0, /*tp_repr*/
8072 0, /*tp_as_number*/
8073 0, /*tp_as_sequence*/
8074 0, /*tp_as_mapping*/
8075 0, /*tp_hash*/
8076 0, /*tp_call*/
8077 0, /*tp_str*/
8078 0, /*tp_getattro*/
8079 0, /*tp_setattro*/
8080 0, /*tp_as_buffer*/
8081 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8082 0, /*tp_doc*/
8083 0, /*tp_traverse*/
8084 0, /*tp_clear*/
8085 0, /*tp_richcompare*/
8086 0, /*tp_weaklistoffset*/
8087 0, /*tp_iter*/
8088 0, /*tp_iternext*/
8089 encoding_map_methods, /*tp_methods*/
8090 0, /*tp_members*/
8091 0, /*tp_getset*/
8092 0, /*tp_base*/
8093 0, /*tp_dict*/
8094 0, /*tp_descr_get*/
8095 0, /*tp_descr_set*/
8096 0, /*tp_dictoffset*/
8097 0, /*tp_init*/
8098 0, /*tp_alloc*/
8099 0, /*tp_new*/
8100 0, /*tp_free*/
8101 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102};
8103
8104PyObject*
8105PyUnicode_BuildEncodingMap(PyObject* string)
8106{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 PyObject *result;
8108 struct encoding_map *mresult;
8109 int i;
8110 int need_dict = 0;
8111 unsigned char level1[32];
8112 unsigned char level2[512];
8113 unsigned char *mlevel1, *mlevel2, *mlevel3;
8114 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 int kind;
8116 void *data;
8117 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyErr_BadArgument();
8121 return NULL;
8122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 kind = PyUnicode_KIND(string);
8124 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 memset(level1, 0xFF, sizeof level1);
8126 memset(level2, 0xFF, sizeof level2);
8127
8128 /* If there isn't a one-to-one mapping of NULL to \0,
8129 or if there are non-BMP characters, we need to use
8130 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 need_dict = 1;
8133 for (i = 1; i < 256; i++) {
8134 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 ch = PyUnicode_READ(kind, data, i);
8136 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137 need_dict = 1;
8138 break;
8139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 /* unmapped character */
8142 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 l1 = ch >> 11;
8144 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 if (level1[l1] == 0xFF)
8146 level1[l1] = count2++;
8147 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 }
8150
8151 if (count2 >= 0xFF || count3 >= 0xFF)
8152 need_dict = 1;
8153
8154 if (need_dict) {
8155 PyObject *result = PyDict_New();
8156 PyObject *key, *value;
8157 if (!result)
8158 return NULL;
8159 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008161 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 if (!key || !value)
8163 goto failed1;
8164 if (PyDict_SetItem(result, key, value) == -1)
8165 goto failed1;
8166 Py_DECREF(key);
8167 Py_DECREF(value);
8168 }
8169 return result;
8170 failed1:
8171 Py_XDECREF(key);
8172 Py_XDECREF(value);
8173 Py_DECREF(result);
8174 return NULL;
8175 }
8176
8177 /* Create a three-level trie */
8178 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8179 16*count2 + 128*count3 - 1);
8180 if (!result)
8181 return PyErr_NoMemory();
8182 PyObject_Init(result, &EncodingMapType);
8183 mresult = (struct encoding_map*)result;
8184 mresult->count2 = count2;
8185 mresult->count3 = count3;
8186 mlevel1 = mresult->level1;
8187 mlevel2 = mresult->level23;
8188 mlevel3 = mresult->level23 + 16*count2;
8189 memcpy(mlevel1, level1, 32);
8190 memset(mlevel2, 0xFF, 16*count2);
8191 memset(mlevel3, 0, 128*count3);
8192 count3 = 0;
8193 for (i = 1; i < 256; i++) {
8194 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196 /* unmapped character */
8197 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 o1 = PyUnicode_READ(kind, data, i)>>11;
8199 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 i2 = 16*mlevel1[o1] + o2;
8201 if (mlevel2[i2] == 0xFF)
8202 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 i3 = 128*mlevel2[i2] + o3;
8205 mlevel3[i3] = i;
8206 }
8207 return result;
8208}
8209
8210static int
Victor Stinner22168992011-11-20 17:09:18 +01008211encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212{
8213 struct encoding_map *map = (struct encoding_map*)mapping;
8214 int l1 = c>>11;
8215 int l2 = (c>>7) & 0xF;
8216 int l3 = c & 0x7F;
8217 int i;
8218
Victor Stinner22168992011-11-20 17:09:18 +01008219 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 if (c == 0)
8222 return 0;
8223 /* level 1*/
8224 i = map->level1[l1];
8225 if (i == 0xFF) {
8226 return -1;
8227 }
8228 /* level 2*/
8229 i = map->level23[16*i+l2];
8230 if (i == 0xFF) {
8231 return -1;
8232 }
8233 /* level 3 */
8234 i = map->level23[16*map->count2 + 128*i + l3];
8235 if (i == 0) {
8236 return -1;
8237 }
8238 return i;
8239}
8240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241/* Lookup the character ch in the mapping. If the character
8242 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008243 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008245charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246{
Christian Heimes217cfd12007-12-02 14:31:20 +00008247 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 PyObject *x;
8249
8250 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 x = PyObject_GetItem(mapping, w);
8253 Py_DECREF(w);
8254 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8256 /* No mapping found means: mapping is undefined. */
8257 PyErr_Clear();
8258 x = Py_None;
8259 Py_INCREF(x);
8260 return x;
8261 } else
8262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008264 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008266 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 long value = PyLong_AS_LONG(x);
8268 if (value < 0 || value > 255) {
8269 PyErr_SetString(PyExc_TypeError,
8270 "character mapping must be in range(256)");
8271 Py_DECREF(x);
8272 return NULL;
8273 }
8274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 /* wrong return value */
8280 PyErr_Format(PyExc_TypeError,
8281 "character mapping must return integer, bytes or None, not %.400s",
8282 x->ob_type->tp_name);
8283 Py_DECREF(x);
8284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 }
8286}
8287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008289charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8292 /* exponentially overallocate to minimize reallocations */
8293 if (requiredsize < 2*outsize)
8294 requiredsize = 2*outsize;
8295 if (_PyBytes_Resize(outobj, requiredsize))
8296 return -1;
8297 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008298}
8299
/* Outcome of encoding a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008304 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 space is available. Return a new reference to the object that
8306 was put in the output buffer, or Py_None, if the mapping was undefined
8307 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008308 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008310charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008311 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313 PyObject *rep;
8314 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316
Christian Heimes90aa7642007-12-19 02:45:37 +00008317 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 if (res == -1)
8321 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 if (outsize<requiredsize)
8323 if (charmapencode_resize(outobj, outpos, requiredsize))
8324 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 outstart[(*outpos)++] = (char)res;
8327 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 }
8329
8330 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_DECREF(rep);
8335 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if (PyLong_Check(rep)) {
8338 Py_ssize_t requiredsize = *outpos+1;
8339 if (outsize<requiredsize)
8340 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8341 Py_DECREF(rep);
8342 return enc_EXCEPTION;
8343 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 else {
8348 const char *repchars = PyBytes_AS_STRING(rep);
8349 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8350 Py_ssize_t requiredsize = *outpos+repsize;
8351 if (outsize<requiredsize)
8352 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8353 Py_DECREF(rep);
8354 return enc_EXCEPTION;
8355 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008356 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 memcpy(outstart + *outpos, repchars, repsize);
8358 *outpos += repsize;
8359 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008361 Py_DECREF(rep);
8362 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363}
8364
8365/* handle an error in PyUnicode_EncodeCharmap
8366 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367static int
8368charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008369 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008371 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008372 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373{
8374 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008375 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008376 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008377 enum PyUnicode_Kind kind;
8378 void *data;
8379 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008381 Py_ssize_t collstartpos = *inpos;
8382 Py_ssize_t collendpos = *inpos+1;
8383 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 char *encoding = "charmap";
8385 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008388 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389
Benjamin Petersonbac79492012-01-14 13:34:47 -05008390 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008391 return -1;
8392 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 /* find all unencodable characters */
8394 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008395 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008396 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008397 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008398 val = encoding_map_lookup(ch, mapping);
8399 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 break;
8401 ++collendpos;
8402 continue;
8403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008404
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8406 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 if (rep==NULL)
8408 return -1;
8409 else if (rep!=Py_None) {
8410 Py_DECREF(rep);
8411 break;
8412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
8416 /* cache callback name lookup
8417 * (if not done yet, i.e. it's the first error) */
8418 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 if ((errors==NULL) || (!strcmp(errors, "strict")))
8420 *known_errorHandler = 1;
8421 else if (!strcmp(errors, "replace"))
8422 *known_errorHandler = 2;
8423 else if (!strcmp(errors, "ignore"))
8424 *known_errorHandler = 3;
8425 else if (!strcmp(errors, "xmlcharrefreplace"))
8426 *known_errorHandler = 4;
8427 else
8428 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008431 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008432 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 return -1;
8434 case 2: /* replace */
8435 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 x = charmapencode_output('?', mapping, res, respos);
8437 if (x==enc_EXCEPTION) {
8438 return -1;
8439 }
8440 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008441 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
8443 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 }
8445 /* fall through */
8446 case 3: /* ignore */
8447 *inpos = collendpos;
8448 break;
8449 case 4: /* xmlcharrefreplace */
8450 /* generate replacement (temporarily (mis)uses p) */
8451 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 char buffer[2+29+1+1];
8453 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008454 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 for (cp = buffer; *cp; ++cp) {
8456 x = charmapencode_output(*cp, mapping, res, respos);
8457 if (x==enc_EXCEPTION)
8458 return -1;
8459 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008460 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
8462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 }
8464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 *inpos = collendpos;
8466 break;
8467 default:
8468 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008469 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008473 if (PyBytes_Check(repunicode)) {
8474 /* Directly copy bytes result to output. */
8475 Py_ssize_t outsize = PyBytes_Size(*res);
8476 Py_ssize_t requiredsize;
8477 repsize = PyBytes_Size(repunicode);
8478 requiredsize = *respos + repsize;
8479 if (requiredsize > outsize)
8480 /* Make room for all additional bytes. */
8481 if (charmapencode_resize(res, respos, requiredsize)) {
8482 Py_DECREF(repunicode);
8483 return -1;
8484 }
8485 memcpy(PyBytes_AsString(*res) + *respos,
8486 PyBytes_AsString(repunicode), repsize);
8487 *respos += repsize;
8488 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008489 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008490 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008492 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008493 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008494 Py_DECREF(repunicode);
8495 return -1;
8496 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008497 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008498 data = PyUnicode_DATA(repunicode);
8499 kind = PyUnicode_KIND(repunicode);
8500 for (index = 0; index < repsize; index++) {
8501 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8502 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008504 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return -1;
8506 }
8507 else if (x==enc_FAILED) {
8508 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008509 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
8511 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008512 }
8513 *inpos = newpos;
8514 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 }
8516 return 0;
8517}
8518
Alexander Belopolsky40018472011-02-26 01:02:56 +00008519PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008520_PyUnicode_EncodeCharmap(PyObject *unicode,
8521 PyObject *mapping,
8522 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 /* output object */
8525 PyObject *res = NULL;
8526 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008530 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 PyObject *errorHandler = NULL;
8532 PyObject *exc = NULL;
8533 /* the following variable is used for caching string comparisons
8534 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8535 * 3=ignore, 4=xmlcharrefreplace */
8536 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537
Benjamin Petersonbac79492012-01-14 13:34:47 -05008538 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 return NULL;
8540 size = PyUnicode_GET_LENGTH(unicode);
8541
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 /* Default to Latin-1 */
8543 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008544 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 /* allocate enough for a simple encoding without
8547 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008548 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 if (res == NULL)
8550 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008551 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008557 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 if (x==enc_EXCEPTION) /* error */
8559 goto onError;
8560 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 &exc,
8563 &known_errorHandler, &errorHandler, errors,
8564 &res, &respos)) {
8565 goto onError;
8566 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008567 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 else
8569 /* done with this character => adjust input position */
8570 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008574 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008575 if (_PyBytes_Resize(&res, respos) < 0)
8576 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 Py_XDECREF(exc);
8579 Py_XDECREF(errorHandler);
8580 return res;
8581
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 Py_XDECREF(res);
8584 Py_XDECREF(exc);
8585 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 return NULL;
8587}
8588
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589/* Deprecated */
8590PyObject *
8591PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8592 Py_ssize_t size,
8593 PyObject *mapping,
8594 const char *errors)
8595{
8596 PyObject *result;
8597 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8598 if (unicode == NULL)
8599 return NULL;
8600 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8601 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008602 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603}
8604
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605PyObject *
8606PyUnicode_AsCharmapString(PyObject *unicode,
8607 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
8609 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 PyErr_BadArgument();
8611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614}
8615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617static void
8618make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620 Py_ssize_t startpos, Py_ssize_t endpos,
8621 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 *exceptionObject = _PyUnicodeTranslateError_Create(
8625 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 }
8627 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8629 goto onError;
8630 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8631 goto onError;
8632 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8633 goto onError;
8634 return;
8635 onError:
8636 Py_DECREF(*exceptionObject);
8637 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 }
8639}
8640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642static void
8643raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008645 Py_ssize_t startpos, Py_ssize_t endpos,
8646 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647{
8648 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652}
8653
8654/* error handling callback helper:
8655 build arguments, call the callback and check the arguments,
8656 put the result into newpos and return the replacement string, which
8657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
8659unicode_translate_call_errorhandler(const char *errors,
8660 PyObject **errorHandler,
8661 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663 Py_ssize_t startpos, Py_ssize_t endpos,
8664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008666 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008668 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *restuple;
8670 PyObject *resunicode;
8671
8672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677
8678 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682
8683 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008688 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
8692 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 &resunicode, &i_newpos)) {
8694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 else
8700 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8703 Py_DECREF(restuple);
8704 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 Py_INCREF(resunicode);
8707 Py_DECREF(restuple);
8708 return resunicode;
8709}
8710
8711/* Lookup the character ch in the mapping and put the result in result,
8712 which must be decrefed by the caller.
8713 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008714static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716{
Christian Heimes217cfd12007-12-02 14:31:20 +00008717 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 PyObject *x;
8719
8720 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 x = PyObject_GetItem(mapping, w);
8723 Py_DECREF(w);
8724 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8726 /* No mapping found means: use 1:1 mapping. */
8727 PyErr_Clear();
8728 *result = NULL;
8729 return 0;
8730 } else
8731 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
8733 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 *result = x;
8735 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008737 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 long value = PyLong_AS_LONG(x);
8739 long max = PyUnicode_GetMax();
8740 if (value < 0 || value > max) {
8741 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008742 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_DECREF(x);
8744 return -1;
8745 }
8746 *result = x;
8747 return 0;
8748 }
8749 else if (PyUnicode_Check(x)) {
8750 *result = x;
8751 return 0;
8752 }
8753 else {
8754 /* wrong return value */
8755 PyErr_SetString(PyExc_TypeError,
8756 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 Py_DECREF(x);
8758 return -1;
8759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760}
8761/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 if not reallocate and adjust various state variables.
8763 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008764static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008769 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 /* exponentially overallocate to minimize reallocations */
8771 if (requiredsize < 2 * oldsize)
8772 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8774 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 }
8778 return 0;
8779}
8780/* lookup the character, put the result in the output string and adjust
8781 various state variables. Return a new reference to the object that
8782 was put in the output buffer in *result, or Py_None, if the mapping was
8783 undefined (in which case no character was written).
8784 The called must decref result.
8785 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008786static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8788 PyObject *mapping, Py_UCS4 **output,
8789 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8793 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008795 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 }
8799 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008801 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804 }
8805 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 Py_ssize_t repsize;
8807 if (PyUnicode_READY(*res) == -1)
8808 return -1;
8809 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 if (repsize==1) {
8811 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 }
8814 else if (repsize!=0) {
8815 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 Py_ssize_t requiredsize = *opos +
8817 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 Py_ssize_t i;
8820 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 for(i = 0; i < repsize; i++)
8823 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 }
8826 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 return 0;
8829}
8830
Alexander Belopolsky40018472011-02-26 01:02:56 +00008831PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832_PyUnicode_TranslateCharmap(PyObject *input,
8833 PyObject *mapping,
8834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 /* input object */
8837 char *idata;
8838 Py_ssize_t size, i;
8839 int kind;
8840 /* output buffer */
8841 Py_UCS4 *output = NULL;
8842 Py_ssize_t osize;
8843 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008846 char *reason = "character maps to <undefined>";
8847 PyObject *errorHandler = NULL;
8848 PyObject *exc = NULL;
8849 /* the following variable is used for caching string comparisons
8850 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8851 * 3=ignore, 4=xmlcharrefreplace */
8852 int known_errorHandler = -1;
8853
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 PyErr_BadArgument();
8856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 if (PyUnicode_READY(input) == -1)
8860 return NULL;
8861 idata = (char*)PyUnicode_DATA(input);
8862 kind = PyUnicode_KIND(input);
8863 size = PyUnicode_GET_LENGTH(input);
8864 i = 0;
8865
8866 if (size == 0) {
8867 Py_INCREF(input);
8868 return input;
8869 }
8870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 /* allocate enough for a simple 1:1 translation without
8872 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 osize = size;
8874 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8875 opos = 0;
8876 if (output == NULL) {
8877 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 /* try to encode it */
8883 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (charmaptranslate_output(input, i, mapping,
8885 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 Py_XDECREF(x);
8887 goto onError;
8888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 else { /* untranslatable character */
8893 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8894 Py_ssize_t repsize;
8895 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 Py_ssize_t collstart = i;
8899 Py_ssize_t collend = i+1;
8900 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 while (collend < size) {
8904 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 goto onError;
8906 Py_XDECREF(x);
8907 if (x!=Py_None)
8908 break;
8909 ++collend;
8910 }
8911 /* cache callback name lookup
8912 * (if not done yet, i.e. it's the first error) */
8913 if (known_errorHandler==-1) {
8914 if ((errors==NULL) || (!strcmp(errors, "strict")))
8915 known_errorHandler = 1;
8916 else if (!strcmp(errors, "replace"))
8917 known_errorHandler = 2;
8918 else if (!strcmp(errors, "ignore"))
8919 known_errorHandler = 3;
8920 else if (!strcmp(errors, "xmlcharrefreplace"))
8921 known_errorHandler = 4;
8922 else
8923 known_errorHandler = 0;
8924 }
8925 switch (known_errorHandler) {
8926 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 raise_translate_exception(&exc, input, collstart,
8928 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008929 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 case 2: /* replace */
8931 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 for (coll = collstart; coll<collend; coll++)
8933 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 /* fall through */
8935 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 break;
8938 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 /* generate replacement (temporarily (mis)uses i) */
8940 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 char buffer[2+29+1+1];
8942 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8944 if (charmaptranslate_makespace(&output, &osize,
8945 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 goto onError;
8947 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 break;
8952 default:
8953 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 reason, input, &exc,
8955 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008956 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008958 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008959 Py_DECREF(repunicode);
8960 goto onError;
8961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 repsize = PyUnicode_GET_LENGTH(repunicode);
8964 if (charmaptranslate_makespace(&output, &osize,
8965 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 Py_DECREF(repunicode);
8967 goto onError;
8968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 for (uni2 = 0; repsize-->0; ++uni2)
8970 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8971 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008974 }
8975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8977 if (!res)
8978 goto onError;
8979 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 Py_XDECREF(exc);
8981 Py_XDECREF(errorHandler);
8982 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008986 Py_XDECREF(exc);
8987 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 return NULL;
8989}
8990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991/* Deprecated. Use PyUnicode_Translate instead. */
8992PyObject *
8993PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8994 Py_ssize_t size,
8995 PyObject *mapping,
8996 const char *errors)
8997{
8998 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8999 if (!unicode)
9000 return NULL;
9001 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9002}
9003
Alexander Belopolsky40018472011-02-26 01:02:56 +00009004PyObject *
9005PyUnicode_Translate(PyObject *str,
9006 PyObject *mapping,
9007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008{
9009 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00009010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 str = PyUnicode_FromObject(str);
9012 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 Py_DECREF(str);
9016 return result;
Tim Petersced69f82003-09-16 20:30:58 +00009017
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 Py_XDECREF(str);
9020 return NULL;
9021}
Tim Petersced69f82003-09-16 20:30:58 +00009022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009024fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025{
9026 /* No need to call PyUnicode_READY(self) because this function is only
9027 called as a callback from fixup() which does it already. */
9028 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9029 const int kind = PyUnicode_KIND(self);
9030 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009031 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009032 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 Py_ssize_t i;
9034
9035 for (i = 0; i < len; ++i) {
9036 ch = PyUnicode_READ(kind, data, i);
9037 fixed = 0;
9038 if (ch > 127) {
9039 if (Py_UNICODE_ISSPACE(ch))
9040 fixed = ' ';
9041 else {
9042 const int decimal = Py_UNICODE_TODECIMAL(ch);
9043 if (decimal >= 0)
9044 fixed = '0' + decimal;
9045 }
9046 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009047 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02009048 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 PyUnicode_WRITE(kind, data, i, fixed);
9050 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009051 else
9052 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 }
9055
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009056 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057}
9058
9059PyObject *
9060_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9061{
9062 if (!PyUnicode_Check(unicode)) {
9063 PyErr_BadInternalCall();
9064 return NULL;
9065 }
9066 if (PyUnicode_READY(unicode) == -1)
9067 return NULL;
9068 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9069 /* If the string is already ASCII, just return the same string */
9070 Py_INCREF(unicode);
9071 return unicode;
9072 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009073 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074}
9075
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009076PyObject *
9077PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9078 Py_ssize_t length)
9079{
Victor Stinnerf0124502011-11-21 23:12:56 +01009080 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009081 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009082 Py_UCS4 maxchar;
9083 enum PyUnicode_Kind kind;
9084 void *data;
9085
Victor Stinner99d7ad02012-02-22 13:37:39 +01009086 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009087 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009088 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009089 if (ch > 127) {
9090 int decimal = Py_UNICODE_TODECIMAL(ch);
9091 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009092 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02009093 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009094 }
9095 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009096
9097 /* Copy to a new string */
9098 decimal = PyUnicode_New(length, maxchar);
9099 if (decimal == NULL)
9100 return decimal;
9101 kind = PyUnicode_KIND(decimal);
9102 data = PyUnicode_DATA(decimal);
9103 /* Iterate over code points */
9104 for (i = 0; i < length; i++) {
9105 Py_UNICODE ch = s[i];
9106 if (ch > 127) {
9107 int decimal = Py_UNICODE_TODECIMAL(ch);
9108 if (decimal >= 0)
9109 ch = '0' + decimal;
9110 }
9111 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009113 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009114}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009115/* --- Decimal Encoder ---------------------------------------------------- */
9116
Alexander Belopolsky40018472011-02-26 01:02:56 +00009117int
9118PyUnicode_EncodeDecimal(Py_UNICODE *s,
9119 Py_ssize_t length,
9120 char *output,
9121 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009122{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009123 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009124 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009125 enum PyUnicode_Kind kind;
9126 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009127
9128 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 PyErr_BadArgument();
9130 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009131 }
9132
Victor Stinner42bf7752011-11-21 22:52:58 +01009133 unicode = PyUnicode_FromUnicode(s, length);
9134 if (unicode == NULL)
9135 return -1;
9136
Benjamin Petersonbac79492012-01-14 13:34:47 -05009137 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009138 Py_DECREF(unicode);
9139 return -1;
9140 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009141 kind = PyUnicode_KIND(unicode);
9142 data = PyUnicode_DATA(unicode);
9143
Victor Stinnerb84d7232011-11-22 01:50:07 +01009144 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009145 PyObject *exc;
9146 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009148 Py_ssize_t startpos;
9149
9150 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009151
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009154 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009156 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 decimal = Py_UNICODE_TODECIMAL(ch);
9158 if (decimal >= 0) {
9159 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009160 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 continue;
9162 }
9163 if (0 < ch && ch < 256) {
9164 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009165 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 continue;
9167 }
Victor Stinner6345be92011-11-25 20:09:01 +01009168
Victor Stinner42bf7752011-11-21 22:52:58 +01009169 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009170 exc = NULL;
9171 raise_encode_exception(&exc, "decimal", unicode,
9172 startpos, startpos+1,
9173 "invalid decimal Unicode string");
9174 Py_XDECREF(exc);
9175 Py_DECREF(unicode);
9176 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009177 }
9178 /* 0-terminate the output string */
9179 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009181 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009182}
9183
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184/* --- Helpers ------------------------------------------------------------ */
9185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009187any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 Py_ssize_t start,
9189 Py_ssize_t end)
9190{
9191 int kind1, kind2, kind;
9192 void *buf1, *buf2;
9193 Py_ssize_t len1, len2, result;
9194
9195 kind1 = PyUnicode_KIND(s1);
9196 kind2 = PyUnicode_KIND(s2);
9197 kind = kind1 > kind2 ? kind1 : kind2;
9198 buf1 = PyUnicode_DATA(s1);
9199 buf2 = PyUnicode_DATA(s2);
9200 if (kind1 != kind)
9201 buf1 = _PyUnicode_AsKind(s1, kind);
9202 if (!buf1)
9203 return -2;
9204 if (kind2 != kind)
9205 buf2 = _PyUnicode_AsKind(s2, kind);
9206 if (!buf2) {
9207 if (kind1 != kind) PyMem_Free(buf1);
9208 return -2;
9209 }
9210 len1 = PyUnicode_GET_LENGTH(s1);
9211 len2 = PyUnicode_GET_LENGTH(s2);
9212
Victor Stinner794d5672011-10-10 03:21:36 +02009213 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009214 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009215 case PyUnicode_1BYTE_KIND:
9216 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9217 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9218 else
9219 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 case PyUnicode_2BYTE_KIND:
9222 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9223 break;
9224 case PyUnicode_4BYTE_KIND:
9225 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9226 break;
9227 default:
9228 assert(0); result = -2;
9229 }
9230 }
9231 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009232 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009233 case PyUnicode_1BYTE_KIND:
9234 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9235 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9236 else
9237 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 case PyUnicode_2BYTE_KIND:
9240 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9241 break;
9242 case PyUnicode_4BYTE_KIND:
9243 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9244 break;
9245 default:
9246 assert(0); result = -2;
9247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 }
9249
9250 if (kind1 != kind)
9251 PyMem_Free(buf1);
9252 if (kind2 != kind)
9253 PyMem_Free(buf2);
9254
9255 return result;
9256}
9257
9258Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009259_PyUnicode_InsertThousandsGrouping(
9260 PyObject *unicode, Py_ssize_t index,
9261 Py_ssize_t n_buffer,
9262 void *digits, Py_ssize_t n_digits,
9263 Py_ssize_t min_width,
9264 const char *grouping, PyObject *thousands_sep,
9265 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266{
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009268 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009269 Py_ssize_t thousands_sep_len;
9270 Py_ssize_t len;
9271
9272 if (unicode != NULL) {
9273 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009274 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009275 }
9276 else {
9277 kind = PyUnicode_1BYTE_KIND;
9278 data = NULL;
9279 }
9280 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9281 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9282 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9283 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009284 if (thousands_sep_kind < kind) {
9285 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9286 if (!thousands_sep_data)
9287 return -1;
9288 }
9289 else {
9290 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9291 if (!data)
9292 return -1;
9293 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009294 }
9295
Benjamin Petersonead6b532011-12-20 17:23:42 -06009296 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009298 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009299 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009300 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009301 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009302 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009303 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009304 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009305 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009306 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009307 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009310 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009311 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009312 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009313 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009314 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009317 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009318 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009319 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009320 break;
9321 default:
9322 assert(0);
9323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009325 if (unicode != NULL && thousands_sep_kind != kind) {
9326 if (thousands_sep_kind < kind)
9327 PyMem_Free(thousands_sep_data);
9328 else
9329 PyMem_Free(data);
9330 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 if (unicode == NULL) {
9332 *maxchar = 127;
9333 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009334 *maxchar = MAX_MAXCHAR(*maxchar,
9335 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 }
9337 }
9338 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339}
9340
9341
/* Clamp a (start, end) slice pair against a sequence of length len, in
   place: end is capped at len, negative values are taken relative to the
   end of the sequence and then floored at 0.  start is NOT capped at len,
   so start may still exceed end afterwards.

   Caveats: the arguments are evaluated several times and are not
   parenthesized, and the expansion is two separate statements rather than
   one -- pass plain lvalues and use it only as a full statement inside a
   braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009356
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357Py_ssize_t
9358PyUnicode_Count(PyObject *str,
9359 PyObject *substr,
9360 Py_ssize_t start,
9361 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009363 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009364 PyObject* str_obj;
9365 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 int kind1, kind2, kind;
9367 void *buf1 = NULL, *buf2 = NULL;
9368 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009369
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009370 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009371 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009373 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009374 if (!sub_obj) {
9375 Py_DECREF(str_obj);
9376 return -1;
9377 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009378 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009379 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 Py_DECREF(str_obj);
9381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Tim Petersced69f82003-09-16 20:30:58 +00009383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 kind1 = PyUnicode_KIND(str_obj);
9385 kind2 = PyUnicode_KIND(sub_obj);
9386 kind = kind1 > kind2 ? kind1 : kind2;
9387 buf1 = PyUnicode_DATA(str_obj);
9388 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009389 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (!buf1)
9391 goto onError;
9392 buf2 = PyUnicode_DATA(sub_obj);
9393 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009394 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (!buf2)
9396 goto onError;
9397 len1 = PyUnicode_GET_LENGTH(str_obj);
9398 len2 = PyUnicode_GET_LENGTH(sub_obj);
9399
9400 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009401 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009403 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9404 result = asciilib_count(
9405 ((Py_UCS1*)buf1) + start, end - start,
9406 buf2, len2, PY_SSIZE_T_MAX
9407 );
9408 else
9409 result = ucs1lib_count(
9410 ((Py_UCS1*)buf1) + start, end - start,
9411 buf2, len2, PY_SSIZE_T_MAX
9412 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 break;
9414 case PyUnicode_2BYTE_KIND:
9415 result = ucs2lib_count(
9416 ((Py_UCS2*)buf1) + start, end - start,
9417 buf2, len2, PY_SSIZE_T_MAX
9418 );
9419 break;
9420 case PyUnicode_4BYTE_KIND:
9421 result = ucs4lib_count(
9422 ((Py_UCS4*)buf1) + start, end - start,
9423 buf2, len2, PY_SSIZE_T_MAX
9424 );
9425 break;
9426 default:
9427 assert(0); result = 0;
9428 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009429
9430 Py_DECREF(sub_obj);
9431 Py_DECREF(str_obj);
9432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 if (kind1 != kind)
9434 PyMem_Free(buf1);
9435 if (kind2 != kind)
9436 PyMem_Free(buf2);
9437
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 onError:
9440 Py_DECREF(sub_obj);
9441 Py_DECREF(str_obj);
9442 if (kind1 != kind && buf1)
9443 PyMem_Free(buf1);
9444 if (kind2 != kind && buf2)
9445 PyMem_Free(buf2);
9446 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447}
9448
Alexander Belopolsky40018472011-02-26 01:02:56 +00009449Py_ssize_t
9450PyUnicode_Find(PyObject *str,
9451 PyObject *sub,
9452 Py_ssize_t start,
9453 Py_ssize_t end,
9454 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009456 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009457
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009459 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009461 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009462 if (!sub) {
9463 Py_DECREF(str);
9464 return -2;
9465 }
9466 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9467 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 Py_DECREF(str);
9469 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 }
Tim Petersced69f82003-09-16 20:30:58 +00009471
Victor Stinner794d5672011-10-10 03:21:36 +02009472 result = any_find_slice(direction,
9473 str, sub, start, end
9474 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009475
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009477 Py_DECREF(sub);
9478
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 return result;
9480}
9481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482Py_ssize_t
9483PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9484 Py_ssize_t start, Py_ssize_t end,
9485 int direction)
9486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009488 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 if (PyUnicode_READY(str) == -1)
9490 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009491 if (start < 0 || end < 0) {
9492 PyErr_SetString(PyExc_IndexError, "string index out of range");
9493 return -2;
9494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 if (end > PyUnicode_GET_LENGTH(str))
9496 end = PyUnicode_GET_LENGTH(str);
9497 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009498 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9499 kind, end-start, ch, direction);
9500 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009502 else
9503 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504}
9505
Alexander Belopolsky40018472011-02-26 01:02:56 +00009506static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009507tailmatch(PyObject *self,
9508 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009509 Py_ssize_t start,
9510 Py_ssize_t end,
9511 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 int kind_self;
9514 int kind_sub;
9515 void *data_self;
9516 void *data_sub;
9517 Py_ssize_t offset;
9518 Py_ssize_t i;
9519 Py_ssize_t end_sub;
9520
9521 if (PyUnicode_READY(self) == -1 ||
9522 PyUnicode_READY(substring) == -1)
9523 return 0;
9524
9525 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526 return 1;
9527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9529 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 kind_self = PyUnicode_KIND(self);
9534 data_self = PyUnicode_DATA(self);
9535 kind_sub = PyUnicode_KIND(substring);
9536 data_sub = PyUnicode_DATA(substring);
9537 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9538
9539 if (direction > 0)
9540 offset = end;
9541 else
9542 offset = start;
9543
9544 if (PyUnicode_READ(kind_self, data_self, offset) ==
9545 PyUnicode_READ(kind_sub, data_sub, 0) &&
9546 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9547 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9548 /* If both are of the same kind, memcmp is sufficient */
9549 if (kind_self == kind_sub) {
9550 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009551 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 data_sub,
9553 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009554 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 }
9556 /* otherwise we have to compare each character by first accesing it */
9557 else {
9558 /* We do not need to compare 0 and len(substring)-1 because
9559 the if statement above ensured already that they are equal
9560 when we end up here. */
9561 // TODO: honor direction and do a forward or backwards search
9562 for (i = 1; i < end_sub; ++i) {
9563 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9564 PyUnicode_READ(kind_sub, data_sub, i))
9565 return 0;
9566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 }
9570
9571 return 0;
9572}
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574Py_ssize_t
9575PyUnicode_Tailmatch(PyObject *str,
9576 PyObject *substr,
9577 Py_ssize_t start,
9578 Py_ssize_t end,
9579 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009581 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009582
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 str = PyUnicode_FromObject(str);
9584 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 substr = PyUnicode_FromObject(substr);
9587 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 Py_DECREF(str);
9589 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590 }
Tim Petersced69f82003-09-16 20:30:58 +00009591
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009592 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009593 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594 Py_DECREF(str);
9595 Py_DECREF(substr);
9596 return result;
9597}
9598
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599/* Apply fixfct filter to the Unicode object self and return a
9600 reference to the modified object */
9601
Alexander Belopolsky40018472011-02-26 01:02:56 +00009602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009603fixup(PyObject *self,
9604 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 PyObject *u;
9607 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009608 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009610 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009613 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 /* fix functions return the new maximum character in a string,
9616 if the kind of the resulting unicode object does not change,
9617 everything is fine. Otherwise we need to change the string kind
9618 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009619 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009620
9621 if (maxchar_new == 0) {
9622 /* no changes */;
9623 if (PyUnicode_CheckExact(self)) {
9624 Py_DECREF(u);
9625 Py_INCREF(self);
9626 return self;
9627 }
9628 else
9629 return u;
9630 }
9631
Victor Stinnere6abb482012-05-02 01:15:40 +02009632 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633
Victor Stinnereaab6042011-12-11 22:22:39 +01009634 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009636
9637 /* In case the maximum character changed, we need to
9638 convert the string to the new category. */
9639 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9640 if (v == NULL) {
9641 Py_DECREF(u);
9642 return NULL;
9643 }
9644 if (maxchar_new > maxchar_old) {
9645 /* If the maxchar increased so that the kind changed, not all
9646 characters are representable anymore and we need to fix the
9647 string again. This only happens in very few cases. */
9648 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9649 maxchar_old = fixfct(v);
9650 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 }
9652 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009653 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009655 Py_DECREF(u);
9656 assert(_PyUnicode_CheckConsistency(v, 1));
9657 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658}
9659
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660static PyObject *
9661ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9664 char *resdata, *data = PyUnicode_DATA(self);
9665 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009666
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 res = PyUnicode_New(len, 127);
9668 if (res == NULL)
9669 return NULL;
9670 resdata = PyUnicode_DATA(res);
9671 if (lower)
9672 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 _Py_bytes_upper(resdata, data, len);
9675 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676}
9677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 Py_ssize_t j;
9682 int final_sigma;
9683 Py_UCS4 c;
9684 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9687
9688 where ! is a negation and \p{xxx} is a character with property xxx.
9689 */
9690 for (j = i - 1; j >= 0; j--) {
9691 c = PyUnicode_READ(kind, data, j);
9692 if (!_PyUnicode_IsCaseIgnorable(c))
9693 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9696 if (final_sigma) {
9697 for (j = i + 1; j < length; j++) {
9698 c = PyUnicode_READ(kind, data, j);
9699 if (!_PyUnicode_IsCaseIgnorable(c))
9700 break;
9701 }
9702 final_sigma = j == length || !_PyUnicode_IsCased(c);
9703 }
9704 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705}
9706
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707static int
9708lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9709 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 /* Obscure special case. */
9712 if (c == 0x3A3) {
9713 mapped[0] = handle_capital_sigma(kind, data, length, i);
9714 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717}
9718
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719static Py_ssize_t
9720do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 Py_ssize_t i, k = 0;
9723 int n_res, j;
9724 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009725
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 c = PyUnicode_READ(kind, data, 0);
9727 n_res = _PyUnicode_ToUpperFull(c, mapped);
9728 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009729 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 for (i = 1; i < length; i++) {
9733 c = PyUnicode_READ(kind, data, i);
9734 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9735 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009736 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009739 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741}
9742
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743static Py_ssize_t
9744do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9745 Py_ssize_t i, k = 0;
9746
9747 for (i = 0; i < length; i++) {
9748 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9749 int n_res, j;
9750 if (Py_UNICODE_ISUPPER(c)) {
9751 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9752 }
9753 else if (Py_UNICODE_ISLOWER(c)) {
9754 n_res = _PyUnicode_ToUpperFull(c, mapped);
9755 }
9756 else {
9757 n_res = 1;
9758 mapped[0] = c;
9759 }
9760 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009761 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 res[k++] = mapped[j];
9763 }
9764 }
9765 return k;
9766}
9767
9768static Py_ssize_t
9769do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9770 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009772 Py_ssize_t i, k = 0;
9773
9774 for (i = 0; i < length; i++) {
9775 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9776 int n_res, j;
9777 if (lower)
9778 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779 else
9780 n_res = _PyUnicode_ToUpperFull(c, mapped);
9781 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009782 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009783 res[k++] = mapped[j];
9784 }
9785 }
9786 return k;
9787}
9788
9789static Py_ssize_t
9790do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9793}
9794
9795static Py_ssize_t
9796do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9799}
9800
Benjamin Petersone51757f2012-01-12 21:10:29 -05009801static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009802do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9803{
9804 Py_ssize_t i, k = 0;
9805
9806 for (i = 0; i < length; i++) {
9807 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9808 Py_UCS4 mapped[3];
9809 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9810 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009811 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009812 res[k++] = mapped[j];
9813 }
9814 }
9815 return k;
9816}
9817
9818static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009819do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9820{
9821 Py_ssize_t i, k = 0;
9822 int previous_is_cased;
9823
9824 previous_is_cased = 0;
9825 for (i = 0; i < length; i++) {
9826 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827 Py_UCS4 mapped[3];
9828 int n_res, j;
9829
9830 if (previous_is_cased)
9831 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9832 else
9833 n_res = _PyUnicode_ToTitleFull(c, mapped);
9834
9835 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009836 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009837 res[k++] = mapped[j];
9838 }
9839
9840 previous_is_cased = _PyUnicode_IsCased(c);
9841 }
9842 return k;
9843}
9844
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845static PyObject *
9846case_operation(PyObject *self,
9847 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9848{
9849 PyObject *res = NULL;
9850 Py_ssize_t length, newlength = 0;
9851 int kind, outkind;
9852 void *data, *outdata;
9853 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9854
Benjamin Petersoneea48462012-01-16 14:28:50 -05009855 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009856
9857 kind = PyUnicode_KIND(self);
9858 data = PyUnicode_DATA(self);
9859 length = PyUnicode_GET_LENGTH(self);
9860 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9861 if (tmp == NULL)
9862 return PyErr_NoMemory();
9863 newlength = perform(kind, data, length, tmp, &maxchar);
9864 res = PyUnicode_New(newlength, maxchar);
9865 if (res == NULL)
9866 goto leave;
9867 tmpend = tmp + newlength;
9868 outdata = PyUnicode_DATA(res);
9869 outkind = PyUnicode_KIND(res);
9870 switch (outkind) {
9871 case PyUnicode_1BYTE_KIND:
9872 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9873 break;
9874 case PyUnicode_2BYTE_KIND:
9875 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9879 break;
9880 default:
9881 assert(0);
9882 break;
9883 }
9884 leave:
9885 PyMem_FREE(tmp);
9886 return res;
9887}
9888
Tim Peters8ce9f162004-08-27 01:49:32 +00009889PyObject *
9890PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009893 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009896 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9897 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009898 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009900 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009902 int use_memcpy;
9903 unsigned char *res_data = NULL, *sep_data = NULL;
9904 PyObject *last_obj;
9905 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 fseq = PySequence_Fast(seq, "");
9908 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009909 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009910 }
9911
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912 /* NOTE: the following code can't call back into Python code,
9913 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009914 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915
Tim Peters05eba1f2004-08-27 21:32:02 +00009916 seqlen = PySequence_Fast_GET_SIZE(fseq);
9917 /* If empty sequence, return u"". */
9918 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009919 Py_DECREF(fseq);
9920 Py_INCREF(unicode_empty);
9921 res = unicode_empty;
9922 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009923 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009926 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009927 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009928 if (seqlen == 1) {
9929 if (PyUnicode_CheckExact(items[0])) {
9930 res = items[0];
9931 Py_INCREF(res);
9932 Py_DECREF(fseq);
9933 return res;
9934 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009935 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009936 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009937 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 /* Set up sep and seplen */
9940 if (separator == NULL) {
9941 /* fall back to a blank space separator */
9942 sep = PyUnicode_FromOrdinal(' ');
9943 if (!sep)
9944 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009945 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009946 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009947 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009948 else {
9949 if (!PyUnicode_Check(separator)) {
9950 PyErr_Format(PyExc_TypeError,
9951 "separator: expected str instance,"
9952 " %.80s found",
9953 Py_TYPE(separator)->tp_name);
9954 goto onError;
9955 }
9956 if (PyUnicode_READY(separator))
9957 goto onError;
9958 sep = separator;
9959 seplen = PyUnicode_GET_LENGTH(separator);
9960 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9961 /* inc refcount to keep this code path symmetric with the
9962 above case of a blank separator */
9963 Py_INCREF(sep);
9964 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009965 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009966 }
9967
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 /* There are at least two things to join, or else we have a subclass
9969 * of str in the sequence.
9970 * Do a pre-pass to figure out the total amount of space we'll
9971 * need (sz), and see whether all argument are strings.
9972 */
9973 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009974#ifdef Py_DEBUG
9975 use_memcpy = 0;
9976#else
9977 use_memcpy = 1;
9978#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009979 for (i = 0; i < seqlen; i++) {
9980 const Py_ssize_t old_sz = sz;
9981 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009982 if (!PyUnicode_Check(item)) {
9983 PyErr_Format(PyExc_TypeError,
9984 "sequence item %zd: expected str instance,"
9985 " %.80s found",
9986 i, Py_TYPE(item)->tp_name);
9987 goto onError;
9988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 if (PyUnicode_READY(item) == -1)
9990 goto onError;
9991 sz += PyUnicode_GET_LENGTH(item);
9992 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009993 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 if (i != 0)
9995 sz += seplen;
9996 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9997 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 goto onError;
10000 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 if (use_memcpy && last_obj != NULL) {
10002 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10003 use_memcpy = 0;
10004 }
10005 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 }
Tim Petersced69f82003-09-16 20:30:58 +000010007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 if (res == NULL)
10010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010011
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 if (use_memcpy) {
10017 res_data = PyUnicode_1BYTE_DATA(res);
10018 kind = PyUnicode_KIND(res);
10019 if (seplen != 0)
10020 sep_data = PyUnicode_1BYTE_DATA(sep);
10021 }
10022#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010024 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010025 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010027 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010028 if (use_memcpy) {
10029 Py_MEMCPY(res_data,
10030 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010031 kind * seplen);
10032 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 }
10034 else {
10035 copy_characters(res, res_offset, sep, 0, seplen);
10036 res_offset += seplen;
10037 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010039 itemlen = PyUnicode_GET_LENGTH(item);
10040 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010041 if (use_memcpy) {
10042 Py_MEMCPY(res_data,
10043 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010044 kind * itemlen);
10045 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 }
10047 else {
10048 copy_characters(res, res_offset, item, 0, itemlen);
10049 res_offset += itemlen;
10050 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010051 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010052 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 if (use_memcpy)
10054 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010055 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010056 else
10057 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010058
Tim Peters05eba1f2004-08-27 21:32:02 +000010059 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010061 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010065 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010067 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068 return NULL;
10069}
10070
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at index `start`.  `kind` selects the width of one code unit
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND is rejected by an assertion.

   The 1-byte case uses memset(); the wider cases use a plain store loop.
   `value` is narrowed to the unit width of the selected kind, so the caller
   must make sure it fits.

   Every argument is parenthesized at each use so that compound expressions
   can be passed safely.  `kind`, `value` and `length` may still be evaluated
   more than once: do not pass expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (Py_UCS2)(value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (Py_UCS4)(value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10094
Victor Stinner3fe55312012-01-04 00:33:50 +010010095Py_ssize_t
10096PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10097 Py_UCS4 fill_char)
10098{
10099 Py_ssize_t maxlen;
10100 enum PyUnicode_Kind kind;
10101 void *data;
10102
10103 if (!PyUnicode_Check(unicode)) {
10104 PyErr_BadInternalCall();
10105 return -1;
10106 }
10107 if (PyUnicode_READY(unicode) == -1)
10108 return -1;
10109 if (unicode_check_modifiable(unicode))
10110 return -1;
10111
10112 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10113 PyErr_SetString(PyExc_ValueError,
10114 "fill character is bigger than "
10115 "the string maximum character");
10116 return -1;
10117 }
10118
10119 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10120 length = Py_MIN(maxlen, length);
10121 if (length <= 0)
10122 return 0;
10123
10124 kind = PyUnicode_KIND(unicode);
10125 data = PyUnicode_DATA(unicode);
10126 FILL(kind, data, fill_char, start, length);
10127 return length;
10128}
10129
Victor Stinner9310abb2011-10-05 00:59:23 +020010130static PyObject *
10131pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010132 Py_ssize_t left,
10133 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 PyObject *u;
10137 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010138 int kind;
10139 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
10141 if (left < 0)
10142 left = 0;
10143 if (right < 0)
10144 right = 0;
10145
Victor Stinnerc4b49542011-12-11 22:44:26 +010010146 if (left == 0 && right == 0)
10147 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10150 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010151 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10152 return NULL;
10153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +020010155 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010157 if (!u)
10158 return NULL;
10159
10160 kind = PyUnicode_KIND(u);
10161 data = PyUnicode_DATA(u);
10162 if (left)
10163 FILL(kind, data, fill, 0, left);
10164 if (right)
10165 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010166 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010167 assert(_PyUnicode_CheckConsistency(u, 1));
10168 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169}
10170
Alexander Belopolsky40018472011-02-26 01:02:56 +000010171PyObject *
10172PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
10176 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010177 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010179 if (PyUnicode_READY(string) == -1) {
10180 Py_DECREF(string);
10181 return NULL;
10182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
Benjamin Petersonead6b532011-12-20 17:23:42 -060010184 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 if (PyUnicode_IS_ASCII(string))
10187 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 PyUnicode_GET_LENGTH(string), keepends);
10190 else
10191 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 break;
10195 case PyUnicode_2BYTE_KIND:
10196 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 PyUnicode_GET_LENGTH(string), keepends);
10199 break;
10200 case PyUnicode_4BYTE_KIND:
10201 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyUnicode_GET_LENGTH(string), keepends);
10204 break;
10205 default:
10206 assert(0);
10207 list = 0;
10208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 Py_DECREF(string);
10210 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211}
10212
Alexander Belopolsky40018472011-02-26 01:02:56 +000010213static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010214split(PyObject *self,
10215 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010216 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 int kind1, kind2, kind;
10219 void *buf1, *buf2;
10220 Py_ssize_t len1, len2;
10221 PyObject* out;
10222
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010224 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (PyUnicode_READY(self) == -1)
10227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010230 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010232 if (PyUnicode_IS_ASCII(self))
10233 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 PyUnicode_GET_LENGTH(self), maxcount
10236 );
10237 else
10238 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 case PyUnicode_2BYTE_KIND:
10243 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
10247 case PyUnicode_4BYTE_KIND:
10248 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 PyUnicode_GET_LENGTH(self), maxcount
10251 );
10252 default:
10253 assert(0);
10254 return NULL;
10255 }
10256
10257 if (PyUnicode_READY(substring) == -1)
10258 return NULL;
10259
10260 kind1 = PyUnicode_KIND(self);
10261 kind2 = PyUnicode_KIND(substring);
10262 kind = kind1 > kind2 ? kind1 : kind2;
10263 buf1 = PyUnicode_DATA(self);
10264 buf2 = PyUnicode_DATA(substring);
10265 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (!buf1)
10268 return NULL;
10269 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010270 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 if (!buf2) {
10272 if (kind1 != kind) PyMem_Free(buf1);
10273 return NULL;
10274 }
10275 len1 = PyUnicode_GET_LENGTH(self);
10276 len2 = PyUnicode_GET_LENGTH(substring);
10277
Benjamin Petersonead6b532011-12-20 17:23:42 -060010278 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10281 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 else
10284 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010285 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 break;
10287 case PyUnicode_2BYTE_KIND:
10288 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 break;
10291 case PyUnicode_4BYTE_KIND:
10292 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 break;
10295 default:
10296 out = NULL;
10297 }
10298 if (kind1 != kind)
10299 PyMem_Free(buf1);
10300 if (kind2 != kind)
10301 PyMem_Free(buf2);
10302 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Alexander Belopolsky40018472011-02-26 01:02:56 +000010305static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010306rsplit(PyObject *self,
10307 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 int kind1, kind2, kind;
10311 void *buf1, *buf2;
10312 Py_ssize_t len1, len2;
10313 PyObject* out;
10314
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010315 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010316 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (PyUnicode_READY(self) == -1)
10319 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010322 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 if (PyUnicode_IS_ASCII(self))
10325 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010326 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 PyUnicode_GET_LENGTH(self), maxcount
10328 );
10329 else
10330 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(self), maxcount
10333 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 case PyUnicode_2BYTE_KIND:
10335 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 PyUnicode_GET_LENGTH(self), maxcount
10338 );
10339 case PyUnicode_4BYTE_KIND:
10340 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(self), maxcount
10343 );
10344 default:
10345 assert(0);
10346 return NULL;
10347 }
10348
10349 if (PyUnicode_READY(substring) == -1)
10350 return NULL;
10351
10352 kind1 = PyUnicode_KIND(self);
10353 kind2 = PyUnicode_KIND(substring);
10354 kind = kind1 > kind2 ? kind1 : kind2;
10355 buf1 = PyUnicode_DATA(self);
10356 buf2 = PyUnicode_DATA(substring);
10357 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (!buf1)
10360 return NULL;
10361 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010362 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 if (!buf2) {
10364 if (kind1 != kind) PyMem_Free(buf1);
10365 return NULL;
10366 }
10367 len1 = PyUnicode_GET_LENGTH(self);
10368 len2 = PyUnicode_GET_LENGTH(substring);
10369
Benjamin Petersonead6b532011-12-20 17:23:42 -060010370 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10373 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 else
10376 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010377 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 break;
10379 case PyUnicode_2BYTE_KIND:
10380 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 break;
10383 case PyUnicode_4BYTE_KIND:
10384 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 break;
10387 default:
10388 out = NULL;
10389 }
10390 if (kind1 != kind)
10391 PyMem_Free(buf1);
10392 if (kind2 != kind)
10393 PyMem_Free(buf2);
10394 return out;
10395}
10396
10397static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10399 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010401 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10404 return asciilib_find(buf1, len1, buf2, len2, offset);
10405 else
10406 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 case PyUnicode_2BYTE_KIND:
10408 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10409 case PyUnicode_4BYTE_KIND:
10410 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10411 }
10412 assert(0);
10413 return -1;
10414}
10415
10416static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10418 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010420 switch (kind) {
10421 case PyUnicode_1BYTE_KIND:
10422 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10423 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10424 else
10425 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10426 case PyUnicode_2BYTE_KIND:
10427 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10428 case PyUnicode_4BYTE_KIND:
10429 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10430 }
10431 assert(0);
10432 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010433}
10434
Alexander Belopolsky40018472011-02-26 01:02:56 +000010435static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436replace(PyObject *self, PyObject *str1,
10437 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 PyObject *u;
10440 char *sbuf = PyUnicode_DATA(self);
10441 char *buf1 = PyUnicode_DATA(str1);
10442 char *buf2 = PyUnicode_DATA(str2);
10443 int srelease = 0, release1 = 0, release2 = 0;
10444 int skind = PyUnicode_KIND(self);
10445 int kind1 = PyUnicode_KIND(str1);
10446 int kind2 = PyUnicode_KIND(str2);
10447 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10448 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10449 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010450 int mayshrink;
10451 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452
10453 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010456 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457
Victor Stinner59de0ee2011-10-07 10:01:28 +020010458 if (str1 == str2)
10459 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (skind < kind1)
10461 /* substring too wide to be present */
10462 goto nothing;
10463
Victor Stinner49a0a212011-10-12 23:46:10 +020010464 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10465 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10466 /* Replacing str1 with str2 may cause a maxchar reduction in the
10467 result string. */
10468 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010469 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010474 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010477 Py_UCS4 u1, u2;
10478 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010479 Py_ssize_t index, pos;
10480 char *src;
10481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010483 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10484 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010490 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010492
10493 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10494 index = 0;
10495 src = sbuf;
10496 while (--maxcount)
10497 {
10498 pos++;
10499 src += pos * PyUnicode_KIND(self);
10500 slen -= pos;
10501 index += pos;
10502 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10503 if (pos < 0)
10504 break;
10505 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10506 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 }
10508 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 int rkind = skind;
10510 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010511 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (kind1 < rkind) {
10514 /* widen substring */
10515 buf1 = _PyUnicode_AsKind(str1, rkind);
10516 if (!buf1) goto error;
10517 release1 = 1;
10518 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010520 if (i < 0)
10521 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (rkind > kind2) {
10523 /* widen replacement */
10524 buf2 = _PyUnicode_AsKind(str2, rkind);
10525 if (!buf2) goto error;
10526 release2 = 1;
10527 }
10528 else if (rkind < kind2) {
10529 /* widen self and buf1 */
10530 rkind = kind2;
10531 if (release1) PyMem_Free(buf1);
10532 sbuf = _PyUnicode_AsKind(self, rkind);
10533 if (!sbuf) goto error;
10534 srelease = 1;
10535 buf1 = _PyUnicode_AsKind(str1, rkind);
10536 if (!buf1) goto error;
10537 release1 = 1;
10538 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010539 u = PyUnicode_New(slen, maxchar);
10540 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010542 assert(PyUnicode_KIND(u) == rkind);
10543 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010544
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010545 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010546 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010547 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010549 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010551
10552 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010553 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010554 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010556 if (i == -1)
10557 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010558 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010560 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 }
10565 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 Py_ssize_t n, i, j, ires;
10567 Py_ssize_t product, new_size;
10568 int rkind = skind;
10569 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf1 = _PyUnicode_AsKind(str1, rkind);
10574 if (!buf1) goto error;
10575 release1 = 1;
10576 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010577 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010578 if (n == 0)
10579 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 buf2 = _PyUnicode_AsKind(str2, rkind);
10583 if (!buf2) goto error;
10584 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010587 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 rkind = kind2;
10589 sbuf = _PyUnicode_AsKind(self, rkind);
10590 if (!sbuf) goto error;
10591 srelease = 1;
10592 if (release1) PyMem_Free(buf1);
10593 buf1 = _PyUnicode_AsKind(str1, rkind);
10594 if (!buf1) goto error;
10595 release1 = 1;
10596 }
10597 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10598 PyUnicode_GET_LENGTH(str1))); */
10599 product = n * (len2-len1);
10600 if ((product / (len2-len1)) != n) {
10601 PyErr_SetString(PyExc_OverflowError,
10602 "replace string is too long");
10603 goto error;
10604 }
10605 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 if (new_size == 0) {
10607 Py_INCREF(unicode_empty);
10608 u = unicode_empty;
10609 goto done;
10610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10612 PyErr_SetString(PyExc_OverflowError,
10613 "replace string is too long");
10614 goto error;
10615 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010616 u = PyUnicode_New(new_size, maxchar);
10617 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010619 assert(PyUnicode_KIND(u) == rkind);
10620 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 ires = i = 0;
10622 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 while (n-- > 0) {
10624 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010625 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010627 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010628 if (j == -1)
10629 break;
10630 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 memcpy(res + rkind * ires,
10633 sbuf + rkind * i,
10634 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 }
10637 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010641 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * ires,
10649 sbuf + rkind * i,
10650 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 }
10652 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 /* interleave */
10654 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 if (--n <= 0)
10660 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 memcpy(res + rkind * ires,
10662 sbuf + rkind * i,
10663 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 ires++;
10665 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
10668 sbuf + rkind * i,
10669 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010671 }
10672
10673 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010674 unicode_adjust_maxchar(&u);
10675 if (u == NULL)
10676 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010678
10679 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (srelease)
10681 PyMem_FREE(sbuf);
10682 if (release1)
10683 PyMem_FREE(buf1);
10684 if (release2)
10685 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010686 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010690 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (srelease)
10692 PyMem_FREE(sbuf);
10693 if (release1)
10694 PyMem_FREE(buf1);
10695 if (release2)
10696 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010697 return unicode_result_unchanged(self);
10698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 error:
10700 if (srelease && sbuf)
10701 PyMem_FREE(sbuf);
10702 if (release1 && buf1)
10703 PyMem_FREE(buf1);
10704 if (release2 && buf2)
10705 PyMem_FREE(buf2);
10706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707}
10708
10709/* --- Unicode Object Methods --------------------------------------------- */
10710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010711PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010712 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713\n\
10714Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010715characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
10717static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010718unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010720 if (PyUnicode_READY(self) == -1)
10721 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010722 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723}
10724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727\n\
10728Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010729have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
10731static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010732unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010734 if (PyUnicode_READY(self) == -1)
10735 return NULL;
10736 if (PyUnicode_GET_LENGTH(self) == 0)
10737 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010738 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739}
10740
Benjamin Petersond5890c82012-01-14 13:23:30 -050010741PyDoc_STRVAR(casefold__doc__,
10742 "S.casefold() -> str\n\
10743\n\
10744Return a version of S suitable for caseless comparisons.");
10745
10746static PyObject *
10747unicode_casefold(PyObject *self)
10748{
10749 if (PyUnicode_READY(self) == -1)
10750 return NULL;
10751 if (PyUnicode_IS_ASCII(self))
10752 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010753 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010754}
10755
10756
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010757/* Argument converter. Coerces to a single unicode character */
10758
10759static int
10760convert_uc(PyObject *obj, void *addr)
10761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010763 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010764
Benjamin Peterson14339b62009-01-31 16:36:08 +000010765 uniobj = PyUnicode_FromObject(obj);
10766 if (uniobj == NULL) {
10767 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010769 return 0;
10770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010772 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 Py_DECREF(uniobj);
10775 return 0;
10776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010778 Py_DECREF(uniobj);
10779 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010780}
10781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010782PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010785Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010786done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
10788static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010789unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010791 Py_ssize_t marg, left;
10792 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 Py_UCS4 fillchar = ' ';
10794
Victor Stinnere9a29352011-10-01 02:14:59 +020010795 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
Benjamin Petersonbac79492012-01-14 13:34:47 -050010798 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 return NULL;
10800
Victor Stinnerc4b49542011-12-11 22:44:26 +010010801 if (PyUnicode_GET_LENGTH(self) >= width)
10802 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803
Victor Stinnerc4b49542011-12-11 22:44:26 +010010804 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805 left = marg / 2 + (marg & width & 1);
10806
Victor Stinner9310abb2011-10-05 00:59:23 +020010807 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808}
10809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810/* This function assumes that str1 and str2 are readied by the caller. */
10811
Marc-André Lemburge5034372000-08-08 08:04:29 +000010812static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010813unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 int kind1, kind2;
10816 void *data1, *data2;
10817 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 kind1 = PyUnicode_KIND(str1);
10820 kind2 = PyUnicode_KIND(str2);
10821 data1 = PyUnicode_DATA(str1);
10822 data2 = PyUnicode_DATA(str2);
10823 len1 = PyUnicode_GET_LENGTH(str1);
10824 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 for (i = 0; i < len1 && i < len2; ++i) {
10827 Py_UCS4 c1, c2;
10828 c1 = PyUnicode_READ(kind1, data1, i);
10829 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010830
10831 if (c1 != c2)
10832 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010833 }
10834
10835 return (len1 < len2) ? -1 : (len1 != len2);
10836}
10837
Alexander Belopolsky40018472011-02-26 01:02:56 +000010838int
10839PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10842 if (PyUnicode_READY(left) == -1 ||
10843 PyUnicode_READY(right) == -1)
10844 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010847 PyErr_Format(PyExc_TypeError,
10848 "Can't compare %.100s and %.100s",
10849 left->ob_type->tp_name,
10850 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 return -1;
10852}
10853
Martin v. Löwis5b222132007-06-10 09:51:05 +000010854int
10855PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 Py_ssize_t i;
10858 int kind;
10859 void *data;
10860 Py_UCS4 chr;
10861
Victor Stinner910337b2011-10-03 03:20:16 +020010862 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (PyUnicode_READY(uni) == -1)
10864 return -1;
10865 kind = PyUnicode_KIND(uni);
10866 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010867 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10869 if (chr != str[i])
10870 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010871 /* This check keeps Python strings that end in '\0' from comparing equal
10872 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010875 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010877 return 0;
10878}
10879
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010880
Benjamin Peterson29060642009-01-31 22:14:21 +000010881#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010882 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010883
Alexander Belopolsky40018472011-02-26 01:02:56 +000010884PyObject *
10885PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010886{
10887 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010889 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10890 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (PyUnicode_READY(left) == -1 ||
10892 PyUnicode_READY(right) == -1)
10893 return NULL;
10894 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10895 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010896 if (op == Py_EQ) {
10897 Py_INCREF(Py_False);
10898 return Py_False;
10899 }
10900 if (op == Py_NE) {
10901 Py_INCREF(Py_True);
10902 return Py_True;
10903 }
10904 }
10905 if (left == right)
10906 result = 0;
10907 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010908 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010909
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010910 /* Convert the return value to a Boolean */
10911 switch (op) {
10912 case Py_EQ:
10913 v = TEST_COND(result == 0);
10914 break;
10915 case Py_NE:
10916 v = TEST_COND(result != 0);
10917 break;
10918 case Py_LE:
10919 v = TEST_COND(result <= 0);
10920 break;
10921 case Py_GE:
10922 v = TEST_COND(result >= 0);
10923 break;
10924 case Py_LT:
10925 v = TEST_COND(result == -1);
10926 break;
10927 case Py_GT:
10928 v = TEST_COND(result == 1);
10929 break;
10930 default:
10931 PyErr_BadArgument();
10932 return NULL;
10933 }
10934 Py_INCREF(v);
10935 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010936 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010937
Brian Curtindfc80e32011-08-10 20:28:54 -050010938 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010939}
10940
Alexander Belopolsky40018472011-02-26 01:02:56 +000010941int
10942PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010943{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 int kind1, kind2, kind;
10946 void *buf1, *buf2;
10947 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010948 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010949
10950 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010951 sub = PyUnicode_FromObject(element);
10952 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010953 PyErr_Format(PyExc_TypeError,
10954 "'in <string>' requires string as left operand, not %s",
10955 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010956 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010957 }
10958
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010960 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010961 Py_DECREF(sub);
10962 return -1;
10963 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010964 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10965 Py_DECREF(sub);
10966 Py_DECREF(str);
10967 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 kind1 = PyUnicode_KIND(str);
10970 kind2 = PyUnicode_KIND(sub);
10971 kind = kind1 > kind2 ? kind1 : kind2;
10972 buf1 = PyUnicode_DATA(str);
10973 buf2 = PyUnicode_DATA(sub);
10974 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010975 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 if (!buf1) {
10977 Py_DECREF(sub);
10978 return -1;
10979 }
10980 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010981 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (!buf2) {
10983 Py_DECREF(sub);
10984 if (kind1 != kind) PyMem_Free(buf1);
10985 return -1;
10986 }
10987 len1 = PyUnicode_GET_LENGTH(str);
10988 len2 = PyUnicode_GET_LENGTH(sub);
10989
Benjamin Petersonead6b532011-12-20 17:23:42 -060010990 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 case PyUnicode_1BYTE_KIND:
10992 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10993 break;
10994 case PyUnicode_2BYTE_KIND:
10995 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10996 break;
10997 case PyUnicode_4BYTE_KIND:
10998 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10999 break;
11000 default:
11001 result = -1;
11002 assert(0);
11003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011004
11005 Py_DECREF(str);
11006 Py_DECREF(sub);
11007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (kind1 != kind)
11009 PyMem_Free(buf1);
11010 if (kind2 != kind)
11011 PyMem_Free(buf2);
11012
Guido van Rossum403d68b2000-03-13 15:55:09 +000011013 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011014}
11015
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016/* Concat to string or Unicode object giving a new Unicode object. */
11017
Alexander Belopolsky40018472011-02-26 01:02:56 +000011018PyObject *
11019PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011022 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011023 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
11033 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011034 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011038 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 }
11042
Victor Stinner488fa492011-12-12 00:01:39 +010011043 u_len = PyUnicode_GET_LENGTH(u);
11044 v_len = PyUnicode_GET_LENGTH(v);
11045 if (u_len > PY_SSIZE_T_MAX - v_len) {
11046 PyErr_SetString(PyExc_OverflowError,
11047 "strings are too large to concat");
11048 goto onError;
11049 }
11050 new_len = u_len + v_len;
11051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011053 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020011054 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011057 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011060 copy_characters(w, 0, u, 0, u_len);
11061 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 Py_DECREF(u);
11063 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011064 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066
Benjamin Peterson29060642009-01-31 22:14:21 +000011067 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 Py_XDECREF(u);
11069 Py_XDECREF(v);
11070 return NULL;
11071}
11072
Walter Dörwald1ab83302007-05-18 17:15:44 +000011073void
Victor Stinner23e56682011-10-03 03:54:37 +020011074PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011075{
Victor Stinner23e56682011-10-03 03:54:37 +020011076 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011077 Py_UCS4 maxchar, maxchar2;
11078 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011079
11080 if (p_left == NULL) {
11081 if (!PyErr_Occurred())
11082 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011083 return;
11084 }
Victor Stinner23e56682011-10-03 03:54:37 +020011085 left = *p_left;
11086 if (right == NULL || !PyUnicode_Check(left)) {
11087 if (!PyErr_Occurred())
11088 PyErr_BadInternalCall();
11089 goto error;
11090 }
11091
Benjamin Petersonbac79492012-01-14 13:34:47 -050011092 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011093 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011094 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011095 goto error;
11096
Victor Stinner488fa492011-12-12 00:01:39 +010011097 /* Shortcuts */
11098 if (left == unicode_empty) {
11099 Py_DECREF(left);
11100 Py_INCREF(right);
11101 *p_left = right;
11102 return;
11103 }
11104 if (right == unicode_empty)
11105 return;
11106
11107 left_len = PyUnicode_GET_LENGTH(left);
11108 right_len = PyUnicode_GET_LENGTH(right);
11109 if (left_len > PY_SSIZE_T_MAX - right_len) {
11110 PyErr_SetString(PyExc_OverflowError,
11111 "strings are too large to concat");
11112 goto error;
11113 }
11114 new_len = left_len + right_len;
11115
11116 if (unicode_modifiable(left)
11117 && PyUnicode_CheckExact(right)
11118 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011119 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11120 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011121 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011122 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011123 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11124 {
11125 /* append inplace */
11126 if (unicode_resize(p_left, new_len) != 0) {
11127 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11128 * deallocated so it cannot be put back into
11129 * 'variable'. The MemoryError is raised when there
11130 * is no value in 'variable', which might (very
11131 * remotely) be a cause of incompatibilities.
11132 */
11133 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011134 }
Victor Stinner488fa492011-12-12 00:01:39 +010011135 /* copy 'right' into the newly allocated area of 'left' */
11136 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011137 }
Victor Stinner488fa492011-12-12 00:01:39 +010011138 else {
11139 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11140 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020011141 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011142
Victor Stinner488fa492011-12-12 00:01:39 +010011143 /* Concat the two Unicode strings */
11144 res = PyUnicode_New(new_len, maxchar);
11145 if (res == NULL)
11146 goto error;
11147 copy_characters(res, 0, left, 0, left_len);
11148 copy_characters(res, left_len, right, 0, right_len);
11149 Py_DECREF(left);
11150 *p_left = res;
11151 }
11152 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011153 return;
11154
11155error:
Victor Stinner488fa492011-12-12 00:01:39 +010011156 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011157}
11158
11159void
11160PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 PyUnicode_Append(pleft, right);
11163 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011170string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172
11173static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011174unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011176 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011177 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011178 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 int kind1, kind2, kind;
11181 void *buf1, *buf2;
11182 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Jesus Ceaac451502011-04-20 17:09:23 +020011184 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11185 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 kind1 = PyUnicode_KIND(self);
11189 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011190 if (kind2 > kind1)
11191 return PyLong_FromLong(0);
11192 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 buf1 = PyUnicode_DATA(self);
11194 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011196 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 if (!buf2) {
11198 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 return NULL;
11200 }
11201 len1 = PyUnicode_GET_LENGTH(self);
11202 len2 = PyUnicode_GET_LENGTH(substring);
11203
11204 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011205 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 case PyUnicode_1BYTE_KIND:
11207 iresult = ucs1lib_count(
11208 ((Py_UCS1*)buf1) + start, end - start,
11209 buf2, len2, PY_SSIZE_T_MAX
11210 );
11211 break;
11212 case PyUnicode_2BYTE_KIND:
11213 iresult = ucs2lib_count(
11214 ((Py_UCS2*)buf1) + start, end - start,
11215 buf2, len2, PY_SSIZE_T_MAX
11216 );
11217 break;
11218 case PyUnicode_4BYTE_KIND:
11219 iresult = ucs4lib_count(
11220 ((Py_UCS4*)buf1) + start, end - start,
11221 buf2, len2, PY_SSIZE_T_MAX
11222 );
11223 break;
11224 default:
11225 assert(0); iresult = 0;
11226 }
11227
11228 result = PyLong_FromSsize_t(iresult);
11229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (kind2 != kind)
11231 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011234
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235 return result;
11236}
11237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011238PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011239 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011241Encode S using the codec registered for encoding. Default encoding\n\
11242is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011243handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011244a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11245'xmlcharrefreplace' as well as any other name registered with\n\
11246codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
11248static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011249unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011251 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 char *encoding = NULL;
11253 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011254
Benjamin Peterson308d6372009-09-18 21:42:35 +000011255 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11256 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011258 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011259}
11260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263\n\
11264Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011265If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266
11267static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011268unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 Py_ssize_t i, j, line_pos, src_len, incr;
11271 Py_UCS4 ch;
11272 PyObject *u;
11273 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011276 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
11278 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Antoine Pitrou22425222011-10-04 19:10:51 +020011281 if (PyUnicode_READY(self) == -1)
11282 return NULL;
11283
Thomas Wouters7e474022000-07-16 12:04:32 +000011284 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 src_len = PyUnicode_GET_LENGTH(self);
11286 i = j = line_pos = 0;
11287 kind = PyUnicode_KIND(self);
11288 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011289 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 for (; i < src_len; i++) {
11291 ch = PyUnicode_READ(kind, src_data, i);
11292 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011293 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 goto overflow;
11298 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011300 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 goto overflow;
11305 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011307 if (ch == '\n' || ch == '\r')
11308 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011311 if (!found)
11312 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011313
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011315 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 if (!u)
11317 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011318 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Antoine Pitroue71d5742011-10-04 15:55:09 +020011320 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Antoine Pitroue71d5742011-10-04 15:55:09 +020011322 for (; i < src_len; i++) {
11323 ch = PyUnicode_READ(kind, src_data, i);
11324 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011325 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011326 incr = tabsize - (line_pos % tabsize);
11327 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011328 FILL(kind, dest_data, ' ', j, incr);
11329 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011331 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011333 line_pos++;
11334 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011335 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011336 if (ch == '\n' || ch == '\r')
11337 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011339 }
11340 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011341 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011342
Antoine Pitroue71d5742011-10-04 15:55:09 +020011343 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011344 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346}
11347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350\n\
11351Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011352such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353arguments start and end are interpreted as in slice notation.\n\
11354\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011355Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
11357static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011360 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011361 Py_ssize_t start;
11362 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011363 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364
Jesus Ceaac451502011-04-20 17:09:23 +020011365 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11366 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (PyUnicode_READY(self) == -1)
11370 return NULL;
11371 if (PyUnicode_READY(substring) == -1)
11372 return NULL;
11373
Victor Stinner7931d9a2011-11-04 00:22:48 +010011374 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
11376 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (result == -2)
11379 return NULL;
11380
Christian Heimes217cfd12007-12-02 14:31:20 +000011381 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
11384static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011385unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011387 void *data;
11388 enum PyUnicode_Kind kind;
11389 Py_UCS4 ch;
11390 PyObject *res;
11391
11392 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11393 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011395 }
11396 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11397 PyErr_SetString(PyExc_IndexError, "string index out of range");
11398 return NULL;
11399 }
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
11402 ch = PyUnicode_READ(kind, data, index);
11403 if (ch < 256)
11404 return get_latin1_char(ch);
11405
11406 res = PyUnicode_New(1, ch);
11407 if (res == NULL)
11408 return NULL;
11409 kind = PyUnicode_KIND(res);
11410 data = PyUnicode_DATA(res);
11411 PyUnicode_WRITE(kind, data, 0, ch);
11412 assert(_PyUnicode_CheckConsistency(res, 1));
11413 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414}
11415
Guido van Rossumc2504932007-09-18 19:42:40 +000011416/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011417 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011418static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011419unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420{
Guido van Rossumc2504932007-09-18 19:42:40 +000011421 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011422 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011423
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011424#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011425 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011426#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (_PyUnicode_HASH(self) != -1)
11428 return _PyUnicode_HASH(self);
11429 if (PyUnicode_READY(self) == -1)
11430 return -1;
11431 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011432 /*
11433 We make the hash of the empty string be 0, rather than using
11434 (prefix ^ suffix), since this slightly obfuscates the hash secret
11435 */
11436 if (len == 0) {
11437 _PyUnicode_HASH(self) = 0;
11438 return 0;
11439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440
11441 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011442#define HASH(P) \
11443 x ^= (Py_uhash_t) *P << 7; \
11444 while (--len >= 0) \
11445 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446
Georg Brandl2fb477c2012-02-21 00:33:36 +010011447 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 switch (PyUnicode_KIND(self)) {
11449 case PyUnicode_1BYTE_KIND: {
11450 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11451 HASH(c);
11452 break;
11453 }
11454 case PyUnicode_2BYTE_KIND: {
11455 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11456 HASH(s);
11457 break;
11458 }
11459 default: {
11460 Py_UCS4 *l;
11461 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11462 "Impossible switch case in unicode_hash");
11463 l = PyUnicode_4BYTE_DATA(self);
11464 HASH(l);
11465 break;
11466 }
11467 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011468 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11469 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470
Guido van Rossumc2504932007-09-18 19:42:40 +000011471 if (x == -1)
11472 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011474 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011481Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
11483static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011486 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011487 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011488 Py_ssize_t start;
11489 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Jesus Ceaac451502011-04-20 17:09:23 +020011491 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11492 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 if (PyUnicode_READY(self) == -1)
11496 return NULL;
11497 if (PyUnicode_READY(substring) == -1)
11498 return NULL;
11499
Victor Stinner7931d9a2011-11-04 00:22:48 +010011500 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (result == -2)
11505 return NULL;
11506
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 if (result < 0) {
11508 PyErr_SetString(PyExc_ValueError, "substring not found");
11509 return NULL;
11510 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011511
Christian Heimes217cfd12007-12-02 14:31:20 +000011512 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513}
11514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011515PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011518Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
11521static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011522unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 Py_ssize_t i, length;
11525 int kind;
11526 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 int cased;
11528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (PyUnicode_READY(self) == -1)
11530 return NULL;
11531 length = PyUnicode_GET_LENGTH(self);
11532 kind = PyUnicode_KIND(self);
11533 data = PyUnicode_DATA(self);
11534
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (length == 1)
11537 return PyBool_FromLong(
11538 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011540 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011543
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 for (i = 0; i < length; i++) {
11546 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011547
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11549 return PyBool_FromLong(0);
11550 else if (!cased && Py_UNICODE_ISLOWER(ch))
11551 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011553 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554}
11555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011556PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011559Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011560at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561
11562static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011563unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 Py_ssize_t i, length;
11566 int kind;
11567 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568 int cased;
11569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 if (PyUnicode_READY(self) == -1)
11571 return NULL;
11572 length = PyUnicode_GET_LENGTH(self);
11573 kind = PyUnicode_KIND(self);
11574 data = PyUnicode_DATA(self);
11575
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (length == 1)
11578 return PyBool_FromLong(
11579 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011581 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011584
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 for (i = 0; i < length; i++) {
11587 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011588
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11590 return PyBool_FromLong(0);
11591 else if (!cased && Py_UNICODE_ISUPPER(ch))
11592 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011594 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595}
11596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011597PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011600Return True if S is a titlecased string and there is at least one\n\
11601character in S, i.e. upper- and titlecase characters may only\n\
11602follow uncased characters and lowercase characters only cased ones.\n\
11603Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
11605static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011606unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 Py_ssize_t i, length;
11609 int kind;
11610 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 int cased, previous_is_cased;
11612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615 length = PyUnicode_GET_LENGTH(self);
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
11618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (length == 1) {
11621 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11622 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11623 (Py_UNICODE_ISUPPER(ch) != 0));
11624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011626 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011629
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630 cased = 0;
11631 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 for (i = 0; i < length; i++) {
11633 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011634
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11636 if (previous_is_cased)
11637 return PyBool_FromLong(0);
11638 previous_is_cased = 1;
11639 cased = 1;
11640 }
11641 else if (Py_UNICODE_ISLOWER(ch)) {
11642 if (!previous_is_cased)
11643 return PyBool_FromLong(0);
11644 previous_is_cased = 1;
11645 cased = 1;
11646 }
11647 else
11648 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011650 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651}
11652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011653PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011654 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011656Return True if all characters in S are whitespace\n\
11657and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011660unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 Py_ssize_t i, length;
11663 int kind;
11664 void *data;
11665
11666 if (PyUnicode_READY(self) == -1)
11667 return NULL;
11668 length = PyUnicode_GET_LENGTH(self);
11669 kind = PyUnicode_KIND(self);
11670 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (length == 1)
11674 return PyBool_FromLong(
11675 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011677 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 for (i = 0; i < length; i++) {
11682 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011683 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011686 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687}
11688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011692Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011694
11695static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011696unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 Py_ssize_t i, length;
11699 int kind;
11700 void *data;
11701
11702 if (PyUnicode_READY(self) == -1)
11703 return NULL;
11704 length = PyUnicode_GET_LENGTH(self);
11705 kind = PyUnicode_KIND(self);
11706 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011707
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011708 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (length == 1)
11710 return PyBool_FromLong(
11711 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011712
11713 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 for (i = 0; i < length; i++) {
11718 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011721 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011722}
11723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011724PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011727Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011728and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011729
11730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011731unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 int kind;
11734 void *data;
11735 Py_ssize_t len, i;
11736
11737 if (PyUnicode_READY(self) == -1)
11738 return NULL;
11739
11740 kind = PyUnicode_KIND(self);
11741 data = PyUnicode_DATA(self);
11742 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011743
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011744 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (len == 1) {
11746 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11747 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11748 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011749
11750 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 for (i = 0; i < len; i++) {
11755 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011756 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011758 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011759 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011760}
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011765Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
11768static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011769unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 Py_ssize_t i, length;
11772 int kind;
11773 void *data;
11774
11775 if (PyUnicode_READY(self) == -1)
11776 return NULL;
11777 length = PyUnicode_GET_LENGTH(self);
11778 kind = PyUnicode_KIND(self);
11779 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (length == 1)
11783 return PyBool_FromLong(
11784 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011786 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 for (i = 0; i < length; i++) {
11791 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011794 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795}
11796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011800Return True if all characters in S are digits\n\
11801and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
11803static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 Py_ssize_t i, length;
11807 int kind;
11808 void *data;
11809
11810 if (PyUnicode_READY(self) == -1)
11811 return NULL;
11812 length = PyUnicode_GET_LENGTH(self);
11813 kind = PyUnicode_KIND(self);
11814 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (length == 1) {
11818 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11819 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011822 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 for (i = 0; i < length; i++) {
11827 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011830 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831}
11832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011836Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011837False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
11839static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011840unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 Py_ssize_t i, length;
11843 int kind;
11844 void *data;
11845
11846 if (PyUnicode_READY(self) == -1)
11847 return NULL;
11848 length = PyUnicode_GET_LENGTH(self);
11849 kind = PyUnicode_KIND(self);
11850 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (length == 1)
11854 return PyBool_FromLong(
11855 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011857 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 for (i = 0; i < length; i++) {
11862 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011865 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
Martin v. Löwis47383402007-08-15 07:32:56 +000011868int
11869PyUnicode_IsIdentifier(PyObject *self)
11870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 int kind;
11872 void *data;
11873 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011874 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (PyUnicode_READY(self) == -1) {
11877 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 }
11880
11881 /* Special case for empty strings */
11882 if (PyUnicode_GET_LENGTH(self) == 0)
11883 return 0;
11884 kind = PyUnicode_KIND(self);
11885 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011886
11887 /* PEP 3131 says that the first character must be in
11888 XID_Start and subsequent characters in XID_Continue,
11889 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011890 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011891 letters, digits, underscore). However, given the current
11892 definition of XID_Start and XID_Continue, it is sufficient
11893 to check just for these, except that _ must be allowed
11894 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011896 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011897 return 0;
11898
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011899 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011902 return 1;
11903}
11904
11905PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011907\n\
11908Return True if S is a valid identifier according\n\
11909to the language definition.");
11910
11911static PyObject*
11912unicode_isidentifier(PyObject *self)
11913{
11914 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11915}
11916
Georg Brandl559e5d72008-06-11 18:37:52 +000011917PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011919\n\
11920Return True if all characters in S are considered\n\
11921printable in repr() or S is empty, False otherwise.");
11922
11923static PyObject*
11924unicode_isprintable(PyObject *self)
11925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 Py_ssize_t i, length;
11927 int kind;
11928 void *data;
11929
11930 if (PyUnicode_READY(self) == -1)
11931 return NULL;
11932 length = PyUnicode_GET_LENGTH(self);
11933 kind = PyUnicode_KIND(self);
11934 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011935
11936 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (length == 1)
11938 return PyBool_FromLong(
11939 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 for (i = 0; i < length; i++) {
11942 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 Py_RETURN_FALSE;
11944 }
11945 }
11946 Py_RETURN_TRUE;
11947}
11948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011949PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011950 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951\n\
11952Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011953iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
11955static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011956unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011958 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959}
11960
Martin v. Löwis18e16552006-02-15 17:27:45 +000011961static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011962unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 if (PyUnicode_READY(self) == -1)
11965 return -1;
11966 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967}
11968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011969PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011972Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011973done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
11975static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011976unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011978 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 Py_UCS4 fillchar = ' ';
11980
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011981 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982 return NULL;
11983
Benjamin Petersonbac79492012-01-14 13:34:47 -050011984 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
Victor Stinnerc4b49542011-12-11 22:44:26 +010011987 if (PyUnicode_GET_LENGTH(self) >= width)
11988 return unicode_result_unchanged(self);
11989
11990 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991}
11992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011993PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011996Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
11998static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011999unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012001 if (PyUnicode_READY(self) == -1)
12002 return NULL;
12003 if (PyUnicode_IS_ASCII(self))
12004 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012005 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006}
12007
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
12016
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012017/* externally visible for str.strip(unicode) */
12018PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012019_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 void *data;
12022 int kind;
12023 Py_ssize_t i, j, len;
12024 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12027 return NULL;
12028
12029 kind = PyUnicode_KIND(self);
12030 data = PyUnicode_DATA(self);
12031 len = PyUnicode_GET_LENGTH(self);
12032 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12033 PyUnicode_DATA(sepobj),
12034 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000012035
Benjamin Peterson14339b62009-01-31 16:36:08 +000012036 i = 0;
12037 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 while (i < len &&
12039 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 i++;
12041 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012042 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012043
Benjamin Peterson14339b62009-01-31 16:36:08 +000012044 j = len;
12045 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 do {
12047 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 } while (j >= i &&
12049 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012051 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012052
Victor Stinner7931d9a2011-11-04 00:22:48 +010012053 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054}
12055
12056PyObject*
12057PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12058{
12059 unsigned char *data;
12060 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012061 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062
Victor Stinnerde636f32011-10-01 03:55:54 +020012063 if (PyUnicode_READY(self) == -1)
12064 return NULL;
12065
Victor Stinner684d5fd2012-05-03 02:32:34 +020012066 length = PyUnicode_GET_LENGTH(self);
12067 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012068
Victor Stinner684d5fd2012-05-03 02:32:34 +020012069 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012070 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071
Victor Stinnerde636f32011-10-01 03:55:54 +020012072 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012073 PyErr_SetString(PyExc_IndexError, "string index out of range");
12074 return NULL;
12075 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020012076 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020012077 Py_INCREF(unicode_empty);
12078 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020012079 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012080
Victor Stinner684d5fd2012-05-03 02:32:34 +020012081 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012082 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012083 data = PyUnicode_1BYTE_DATA(self);
12084 return unicode_fromascii(data + start, length);
12085 }
12086 else {
12087 kind = PyUnicode_KIND(self);
12088 data = PyUnicode_1BYTE_DATA(self);
12089 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012090 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012091 length);
12092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
12095static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012096do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 int kind;
12099 void *data;
12100 Py_ssize_t len, i, j;
12101
12102 if (PyUnicode_READY(self) == -1)
12103 return NULL;
12104
12105 kind = PyUnicode_KIND(self);
12106 data = PyUnicode_DATA(self);
12107 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108
Benjamin Peterson14339b62009-01-31 16:36:08 +000012109 i = 0;
12110 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012112 i++;
12113 }
12114 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 j = len;
12117 if (striptype != LEFTSTRIP) {
12118 do {
12119 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 j++;
12122 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123
Victor Stinner7931d9a2011-11-04 00:22:48 +010012124 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125}
12126
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127
12128static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012129do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012131 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132
Benjamin Peterson14339b62009-01-31 16:36:08 +000012133 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12134 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 if (sep != NULL && sep != Py_None) {
12137 if (PyUnicode_Check(sep))
12138 return _PyUnicode_XStrip(self, striptype, sep);
12139 else {
12140 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 "%s arg must be None or str",
12142 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012143 return NULL;
12144 }
12145 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148}
12149
12150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012151PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012153\n\
12154Return a copy of the string S with leading and trailing\n\
12155whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012156If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012157
12158static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012159unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 if (PyTuple_GET_SIZE(args) == 0)
12162 return do_strip(self, BOTHSTRIP); /* Common case */
12163 else
12164 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012165}
12166
12167
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012168PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012170\n\
12171Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012172If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012173
12174static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012175unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012176{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012177 if (PyTuple_GET_SIZE(args) == 0)
12178 return do_strip(self, LEFTSTRIP); /* Common case */
12179 else
12180 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012181}
12182
12183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012184PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012186\n\
12187Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012188If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012189
12190static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012191unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012193 if (PyTuple_GET_SIZE(args) == 0)
12194 return do_strip(self, RIGHTSTRIP); /* Common case */
12195 else
12196 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012197}
12198
12199
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012201unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012203 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205
Georg Brandl222de0f2009-04-12 12:01:50 +000012206 if (len < 1) {
12207 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012208 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
Victor Stinnerc4b49542011-12-11 22:44:26 +010012211 /* no repeat, return original string */
12212 if (len == 1)
12213 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012214
Benjamin Petersonbac79492012-01-14 13:34:47 -050012215 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 return NULL;
12217
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012218 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012219 PyErr_SetString(PyExc_OverflowError,
12220 "repeated string is too long");
12221 return NULL;
12222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012224
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012225 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226 if (!u)
12227 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012228 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (PyUnicode_GET_LENGTH(str) == 1) {
12231 const int kind = PyUnicode_KIND(str);
12232 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012233 if (kind == PyUnicode_1BYTE_KIND) {
12234 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012235 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012236 }
12237 else if (kind == PyUnicode_2BYTE_KIND) {
12238 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012239 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012240 ucs2[n] = fill_char;
12241 } else {
12242 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12243 assert(kind == PyUnicode_4BYTE_KIND);
12244 for (n = 0; n < len; ++n)
12245 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 }
12248 else {
12249 /* number of characters copied this far */
12250 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012251 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 char *to = (char *) PyUnicode_DATA(u);
12253 Py_MEMCPY(to, PyUnicode_DATA(str),
12254 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 n = (done <= nchars-done) ? done : nchars-done;
12257 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260 }
12261
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012262 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012263 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264}
12265
Alexander Belopolsky40018472011-02-26 01:02:56 +000012266PyObject *
12267PyUnicode_Replace(PyObject *obj,
12268 PyObject *subobj,
12269 PyObject *replobj,
12270 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
12272 PyObject *self;
12273 PyObject *str1;
12274 PyObject *str2;
12275 PyObject *result;
12276
12277 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012278 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012281 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 Py_DECREF(self);
12283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 }
12285 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012286 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 Py_DECREF(self);
12288 Py_DECREF(str1);
12289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012291 if (PyUnicode_READY(self) == -1 ||
12292 PyUnicode_READY(str1) == -1 ||
12293 PyUnicode_READY(str2) == -1)
12294 result = NULL;
12295 else
12296 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 Py_DECREF(self);
12298 Py_DECREF(str1);
12299 Py_DECREF(str2);
12300 return result;
12301}
12302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012303PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012304 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305\n\
12306Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012307old replaced by new. If the optional argument count is\n\
12308given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
12310static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 PyObject *str1;
12314 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012315 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316 PyObject *result;
12317
Martin v. Löwis18e16552006-02-15 17:27:45 +000012318 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012320 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012323 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 return NULL;
12325 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012326 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 Py_DECREF(str1);
12328 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012329 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012330 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12331 result = NULL;
12332 else
12333 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
12335 Py_DECREF(str1);
12336 Py_DECREF(str2);
12337 return result;
12338}
12339
Alexander Belopolsky40018472011-02-26 01:02:56 +000012340static PyObject *
12341unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012343 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 Py_ssize_t isize;
12345 Py_ssize_t osize, squote, dquote, i, o;
12346 Py_UCS4 max, quote;
12347 int ikind, okind;
12348 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012351 return NULL;
12352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 isize = PyUnicode_GET_LENGTH(unicode);
12354 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 /* Compute length of output, quote characters, and
12357 maximum character */
12358 osize = 2; /* quotes */
12359 max = 127;
12360 squote = dquote = 0;
12361 ikind = PyUnicode_KIND(unicode);
12362 for (i = 0; i < isize; i++) {
12363 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12364 switch (ch) {
12365 case '\'': squote++; osize++; break;
12366 case '"': dquote++; osize++; break;
12367 case '\\': case '\t': case '\r': case '\n':
12368 osize += 2; break;
12369 default:
12370 /* Fast-path ASCII */
12371 if (ch < ' ' || ch == 0x7f)
12372 osize += 4; /* \xHH */
12373 else if (ch < 0x7f)
12374 osize++;
12375 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12376 osize++;
12377 max = ch > max ? ch : max;
12378 }
12379 else if (ch < 0x100)
12380 osize += 4; /* \xHH */
12381 else if (ch < 0x10000)
12382 osize += 6; /* \uHHHH */
12383 else
12384 osize += 10; /* \uHHHHHHHH */
12385 }
12386 }
12387
12388 quote = '\'';
12389 if (squote) {
12390 if (dquote)
12391 /* Both squote and dquote present. Use squote,
12392 and escape them */
12393 osize += squote;
12394 else
12395 quote = '"';
12396 }
12397
12398 repr = PyUnicode_New(osize, max);
12399 if (repr == NULL)
12400 return NULL;
12401 okind = PyUnicode_KIND(repr);
12402 odata = PyUnicode_DATA(repr);
12403
12404 PyUnicode_WRITE(okind, odata, 0, quote);
12405 PyUnicode_WRITE(okind, odata, osize-1, quote);
12406
12407 for (i = 0, o = 1; i < isize; i++) {
12408 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012409
12410 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 if ((ch == quote) || (ch == '\\')) {
12412 PyUnicode_WRITE(okind, odata, o++, '\\');
12413 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012414 continue;
12415 }
12416
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012418 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012421 }
12422 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 PyUnicode_WRITE(okind, odata, o++, '\\');
12424 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012425 }
12426 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 PyUnicode_WRITE(okind, odata, o++, '\\');
12428 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012429 }
12430
12431 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012432 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 PyUnicode_WRITE(okind, odata, o++, '\\');
12434 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012437 }
12438
Georg Brandl559e5d72008-06-11 18:37:52 +000012439 /* Copy ASCII characters as-is */
12440 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012442 }
12443
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012445 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012446 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012447 (categories Z* and C* except ASCII space)
12448 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012450 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 if (ch <= 0xff) {
12452 PyUnicode_WRITE(okind, odata, o++, '\\');
12453 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012456 }
12457 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 else if (ch >= 0x10000) {
12459 PyUnicode_WRITE(okind, odata, o++, '\\');
12460 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012469 }
12470 /* Map 16-bit characters to '\uxxxx' */
12471 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 PyUnicode_WRITE(okind, odata, o++, '\\');
12473 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12476 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12477 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012478 }
12479 }
12480 /* Copy characters as-is */
12481 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012483 }
12484 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012487 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012488 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489}
12490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012491PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493\n\
12494Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012495such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496arguments start and end are interpreted as in slice notation.\n\
12497\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012498Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499
12500static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012503 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012504 Py_ssize_t start;
12505 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
Jesus Ceaac451502011-04-20 17:09:23 +020012508 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12509 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 if (PyUnicode_READY(self) == -1)
12513 return NULL;
12514 if (PyUnicode_READY(substring) == -1)
12515 return NULL;
12516
Victor Stinner7931d9a2011-11-04 00:22:48 +010012517 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518
12519 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 if (result == -2)
12522 return NULL;
12523
Christian Heimes217cfd12007-12-02 14:31:20 +000012524 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525}
12526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012527PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012530Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531
12532static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012535 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012536 Py_ssize_t start;
12537 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012538 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
Jesus Ceaac451502011-04-20 17:09:23 +020012540 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12541 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 if (PyUnicode_READY(self) == -1)
12545 return NULL;
12546 if (PyUnicode_READY(substring) == -1)
12547 return NULL;
12548
Victor Stinner7931d9a2011-11-04 00:22:48 +010012549 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
12551 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 if (result == -2)
12554 return NULL;
12555
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 if (result < 0) {
12557 PyErr_SetString(PyExc_ValueError, "substring not found");
12558 return NULL;
12559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560
Christian Heimes217cfd12007-12-02 14:31:20 +000012561 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562}
12563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012564PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012567Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012568done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569
12570static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012571unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012573 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 Py_UCS4 fillchar = ' ';
12575
Victor Stinnere9a29352011-10-01 02:14:59 +020012576 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012578
Benjamin Petersonbac79492012-01-14 13:34:47 -050012579 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580 return NULL;
12581
Victor Stinnerc4b49542011-12-11 22:44:26 +010012582 if (PyUnicode_GET_LENGTH(self) >= width)
12583 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584
Victor Stinnerc4b49542011-12-11 22:44:26 +010012585 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
12587
Alexander Belopolsky40018472011-02-26 01:02:56 +000012588PyObject *
12589PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590{
12591 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012592
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 s = PyUnicode_FromObject(s);
12594 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012595 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 if (sep != NULL) {
12597 sep = PyUnicode_FromObject(sep);
12598 if (sep == NULL) {
12599 Py_DECREF(s);
12600 return NULL;
12601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602 }
12603
Victor Stinner9310abb2011-10-05 00:59:23 +020012604 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605
12606 Py_DECREF(s);
12607 Py_XDECREF(sep);
12608 return result;
12609}
12610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012611PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012612 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613\n\
12614Return a list of the words in S, using sep as the\n\
12615delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012616splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012617whitespace string is a separator and empty strings are\n\
12618removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619
12620static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012621unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012623 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012625 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012627 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12628 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629 return NULL;
12630
12631 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012634 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012636 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637}
12638
Thomas Wouters477c8d52006-05-27 19:21:47 +000012639PyObject *
12640PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12641{
12642 PyObject* str_obj;
12643 PyObject* sep_obj;
12644 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 int kind1, kind2, kind;
12646 void *buf1 = NULL, *buf2 = NULL;
12647 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012648
12649 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012650 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012652 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012653 if (!sep_obj) {
12654 Py_DECREF(str_obj);
12655 return NULL;
12656 }
12657 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12658 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659 Py_DECREF(str_obj);
12660 return NULL;
12661 }
12662
Victor Stinner14f8f022011-10-05 20:58:25 +020012663 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012665 kind = Py_MAX(kind1, kind2);
12666 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012668 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 if (!buf1)
12670 goto onError;
12671 buf2 = PyUnicode_DATA(sep_obj);
12672 if (kind2 != kind)
12673 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12674 if (!buf2)
12675 goto onError;
12676 len1 = PyUnicode_GET_LENGTH(str_obj);
12677 len2 = PyUnicode_GET_LENGTH(sep_obj);
12678
Benjamin Petersonead6b532011-12-20 17:23:42 -060012679 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012681 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12682 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12683 else
12684 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 break;
12686 case PyUnicode_2BYTE_KIND:
12687 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12688 break;
12689 case PyUnicode_4BYTE_KIND:
12690 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12691 break;
12692 default:
12693 assert(0);
12694 out = 0;
12695 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012696
12697 Py_DECREF(sep_obj);
12698 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 if (kind1 != kind)
12700 PyMem_Free(buf1);
12701 if (kind2 != kind)
12702 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012703
12704 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 onError:
12706 Py_DECREF(sep_obj);
12707 Py_DECREF(str_obj);
12708 if (kind1 != kind && buf1)
12709 PyMem_Free(buf1);
12710 if (kind2 != kind && buf2)
12711 PyMem_Free(buf2);
12712 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012713}
12714
12715
12716PyObject *
12717PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12718{
12719 PyObject* str_obj;
12720 PyObject* sep_obj;
12721 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 int kind1, kind2, kind;
12723 void *buf1 = NULL, *buf2 = NULL;
12724 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012725
12726 str_obj = PyUnicode_FromObject(str_in);
12727 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729 sep_obj = PyUnicode_FromObject(sep_in);
12730 if (!sep_obj) {
12731 Py_DECREF(str_obj);
12732 return NULL;
12733 }
12734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 kind1 = PyUnicode_KIND(str_in);
12736 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012737 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 buf1 = PyUnicode_DATA(str_in);
12739 if (kind1 != kind)
12740 buf1 = _PyUnicode_AsKind(str_in, kind);
12741 if (!buf1)
12742 goto onError;
12743 buf2 = PyUnicode_DATA(sep_obj);
12744 if (kind2 != kind)
12745 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12746 if (!buf2)
12747 goto onError;
12748 len1 = PyUnicode_GET_LENGTH(str_obj);
12749 len2 = PyUnicode_GET_LENGTH(sep_obj);
12750
Benjamin Petersonead6b532011-12-20 17:23:42 -060012751 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012753 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12754 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12755 else
12756 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 break;
12758 case PyUnicode_2BYTE_KIND:
12759 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12760 break;
12761 case PyUnicode_4BYTE_KIND:
12762 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12763 break;
12764 default:
12765 assert(0);
12766 out = 0;
12767 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012768
12769 Py_DECREF(sep_obj);
12770 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 if (kind1 != kind)
12772 PyMem_Free(buf1);
12773 if (kind2 != kind)
12774 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775
12776 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 onError:
12778 Py_DECREF(sep_obj);
12779 Py_DECREF(str_obj);
12780 if (kind1 != kind && buf1)
12781 PyMem_Free(buf1);
12782 if (kind2 != kind && buf2)
12783 PyMem_Free(buf2);
12784 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012785}
12786
12787PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012788 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012790Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012792found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793
12794static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012795unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796{
Victor Stinner9310abb2011-10-05 00:59:23 +020012797 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798}
12799
12800PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012801 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012803Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012805separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806
12807static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012808unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809{
Victor Stinner9310abb2011-10-05 00:59:23 +020012810 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811}
12812
Alexander Belopolsky40018472011-02-26 01:02:56 +000012813PyObject *
12814PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012815{
12816 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012818 s = PyUnicode_FromObject(s);
12819 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 if (sep != NULL) {
12822 sep = PyUnicode_FromObject(sep);
12823 if (sep == NULL) {
12824 Py_DECREF(s);
12825 return NULL;
12826 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012827 }
12828
Victor Stinner9310abb2011-10-05 00:59:23 +020012829 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012830
12831 Py_DECREF(s);
12832 Py_XDECREF(sep);
12833 return result;
12834}
12835
12836PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012837 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012838\n\
12839Return a list of the words in S, using sep as the\n\
12840delimiter string, starting at the end of the string and\n\
12841working to the front. If maxsplit is given, at most maxsplit\n\
12842splits are done. If sep is not specified, any whitespace string\n\
12843is a separator.");
12844
12845static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012846unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012847{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012848 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012849 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012850 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012851
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012852 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12853 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012854 return NULL;
12855
12856 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012858 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012859 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012860 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012861 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012862}
12863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012864PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866\n\
12867Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012868Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012869is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870
12871static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012872unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012874 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012875 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012877 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12878 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879 return NULL;
12880
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012881 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882}
12883
12884static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012885PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012887 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888}
12889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012890PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012891 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892\n\
12893Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012894and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895
12896static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012897unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012899 if (PyUnicode_READY(self) == -1)
12900 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012901 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902}
12903
Georg Brandlceee0772007-11-27 23:48:05 +000012904PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012906\n\
12907Return a translation table usable for str.translate().\n\
12908If there is only one argument, it must be a dictionary mapping Unicode\n\
12909ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012910Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012911If there are two arguments, they must be strings of equal length, and\n\
12912in the resulting dictionary, each character in x will be mapped to the\n\
12913character at the same position in y. If there is a third argument, it\n\
12914must be a string, whose characters will be mapped to None in the result.");
12915
12916static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012917unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012918{
12919 PyObject *x, *y = NULL, *z = NULL;
12920 PyObject *new = NULL, *key, *value;
12921 Py_ssize_t i = 0;
12922 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012923
Georg Brandlceee0772007-11-27 23:48:05 +000012924 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12925 return NULL;
12926 new = PyDict_New();
12927 if (!new)
12928 return NULL;
12929 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 int x_kind, y_kind, z_kind;
12931 void *x_data, *y_data, *z_data;
12932
Georg Brandlceee0772007-11-27 23:48:05 +000012933 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012934 if (!PyUnicode_Check(x)) {
12935 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12936 "be a string if there is a second argument");
12937 goto err;
12938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012940 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12941 "arguments must have equal length");
12942 goto err;
12943 }
12944 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 x_kind = PyUnicode_KIND(x);
12946 y_kind = PyUnicode_KIND(y);
12947 x_data = PyUnicode_DATA(x);
12948 y_data = PyUnicode_DATA(y);
12949 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12950 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012951 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012952 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012953 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012954 if (!value) {
12955 Py_DECREF(key);
12956 goto err;
12957 }
Georg Brandlceee0772007-11-27 23:48:05 +000012958 res = PyDict_SetItem(new, key, value);
12959 Py_DECREF(key);
12960 Py_DECREF(value);
12961 if (res < 0)
12962 goto err;
12963 }
12964 /* create entries for deleting chars in z */
12965 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 z_kind = PyUnicode_KIND(z);
12967 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012968 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012970 if (!key)
12971 goto err;
12972 res = PyDict_SetItem(new, key, Py_None);
12973 Py_DECREF(key);
12974 if (res < 0)
12975 goto err;
12976 }
12977 }
12978 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 int kind;
12980 void *data;
12981
Georg Brandlceee0772007-11-27 23:48:05 +000012982 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012983 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012984 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12985 "to maketrans it must be a dict");
12986 goto err;
12987 }
12988 /* copy entries into the new dict, converting string keys to int keys */
12989 while (PyDict_Next(x, &i, &key, &value)) {
12990 if (PyUnicode_Check(key)) {
12991 /* convert string keys to integer keys */
12992 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012993 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012994 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12995 "table must be of length 1");
12996 goto err;
12997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 kind = PyUnicode_KIND(key);
12999 data = PyUnicode_DATA(key);
13000 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013001 if (!newkey)
13002 goto err;
13003 res = PyDict_SetItem(new, newkey, value);
13004 Py_DECREF(newkey);
13005 if (res < 0)
13006 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013007 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013008 /* just keep integer keys */
13009 if (PyDict_SetItem(new, key, value) < 0)
13010 goto err;
13011 } else {
13012 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13013 "be strings or integers");
13014 goto err;
13015 }
13016 }
13017 }
13018 return new;
13019 err:
13020 Py_DECREF(new);
13021 return NULL;
13022}
13023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013024PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013025 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026\n\
13027Return a copy of the string S, where all characters have been mapped\n\
13028through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013029Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013030Unmapped characters are left untouched. Characters mapped to None\n\
13031are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
13033static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037}
13038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013039PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013042Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043
13044static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013045unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013047 if (PyUnicode_READY(self) == -1)
13048 return NULL;
13049 if (PyUnicode_IS_ASCII(self))
13050 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013051 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052}
13053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013054PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013057Pad a numeric string S with zeros on the left, to fill a field\n\
13058of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059
13060static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013061unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013063 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013064 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013065 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 int kind;
13067 void *data;
13068 Py_UCS4 chr;
13069
Martin v. Löwis18e16552006-02-15 17:27:45 +000013070 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071 return NULL;
13072
Benjamin Petersonbac79492012-01-14 13:34:47 -050013073 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
Victor Stinnerc4b49542011-12-11 22:44:26 +010013076 if (PyUnicode_GET_LENGTH(self) >= width)
13077 return unicode_result_unchanged(self);
13078
13079 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080
13081 u = pad(self, fill, 0, '0');
13082
Walter Dörwald068325e2002-04-15 13:36:47 +000013083 if (u == NULL)
13084 return NULL;
13085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 kind = PyUnicode_KIND(u);
13087 data = PyUnicode_DATA(u);
13088 chr = PyUnicode_READ(kind, data, fill);
13089
13090 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 PyUnicode_WRITE(kind, data, 0, chr);
13093 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094 }
13095
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013096 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013097 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099
#if 0
/* Compiled out.  Thin wrapper that forwards self to
 * PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013108PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013111Return True if S starts with the specified prefix, False otherwise.\n\
13112With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013113With optional end, stop comparing S at that position.\n\
13114prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
13116static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013117unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013120 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013121 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013122 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013123 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013124 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
Jesus Ceaac451502011-04-20 17:09:23 +020013126 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013128 if (PyTuple_Check(subobj)) {
13129 Py_ssize_t i;
13130 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013131 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013132 if (substring == NULL)
13133 return NULL;
13134 result = tailmatch(self, substring, start, end, -1);
13135 Py_DECREF(substring);
13136 if (result) {
13137 Py_RETURN_TRUE;
13138 }
13139 }
13140 /* nothing matched */
13141 Py_RETURN_FALSE;
13142 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013143 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013144 if (substring == NULL) {
13145 if (PyErr_ExceptionMatches(PyExc_TypeError))
13146 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13147 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013149 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013150 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153}
13154
13155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013156PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013159Return True if S ends with the specified suffix, False otherwise.\n\
13160With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013161With optional end, stop comparing S at that position.\n\
13162suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163
13164static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013165unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013168 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013169 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013170 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013171 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013172 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173
Jesus Ceaac451502011-04-20 17:09:23 +020013174 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013176 if (PyTuple_Check(subobj)) {
13177 Py_ssize_t i;
13178 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013179 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013181 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013183 result = tailmatch(self, substring, start, end, +1);
13184 Py_DECREF(substring);
13185 if (result) {
13186 Py_RETURN_TRUE;
13187 }
13188 }
13189 Py_RETURN_FALSE;
13190 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013191 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013192 if (substring == NULL) {
13193 if (PyErr_ExceptionMatches(PyExc_TypeError))
13194 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13195 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013197 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013200 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201}
13202
Victor Stinner202fdca2012-05-07 12:47:02 +020013203typedef struct {
13204 PyObject *buffer;
13205 void *data;
13206 enum PyUnicode_Kind kind;
13207 Py_UCS4 maxchar;
13208 Py_ssize_t pos;
13209} unicode_writer_t;
13210
13211Py_LOCAL_INLINE(void)
13212unicode_writer_update(unicode_writer_t *writer)
13213{
13214 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13215 writer->data = PyUnicode_DATA(writer->buffer);
13216 writer->kind = PyUnicode_KIND(writer->buffer);
13217}
13218
13219Py_LOCAL(int)
13220unicode_writer_init(unicode_writer_t *writer,
13221 Py_ssize_t length, Py_UCS4 maxchar)
13222{
13223 writer->pos = 0;
13224 writer->buffer = PyUnicode_New(length, maxchar);
13225 if (writer->buffer == NULL)
13226 return -1;
13227 unicode_writer_update(writer);
13228 return 0;
13229}
13230
13231Py_LOCAL_INLINE(int)
13232unicode_writer_prepare(unicode_writer_t *writer,
13233 Py_ssize_t length, Py_UCS4 maxchar)
13234{
13235 Py_ssize_t newlen;
13236 PyObject *newbuffer;
13237
13238 if (length > PY_SSIZE_T_MAX - writer->pos) {
13239 PyErr_NoMemory();
13240 return -1;
13241 }
13242 newlen = writer->pos + length;
13243
13244 if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
Victor Stinner10680252012-05-07 23:50:05 +020013245 /* overallocate 25% to limit the number of resize */
13246 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
Victor Stinner202fdca2012-05-07 12:47:02 +020013247 newlen += newlen / 4;
13248
13249 if (maxchar > writer->maxchar) {
13250 /* resize + widen */
13251 newbuffer = PyUnicode_New(newlen, maxchar);
13252 if (newbuffer == NULL)
13253 return -1;
13254 PyUnicode_CopyCharacters(newbuffer, 0,
13255 writer->buffer, 0, writer->pos);
13256 Py_DECREF(writer->buffer);
13257 }
13258 else {
13259 newbuffer = resize_compact(writer->buffer, newlen);
13260 if (newbuffer == NULL)
13261 return -1;
13262 }
13263 writer->buffer = newbuffer;
13264 unicode_writer_update(writer);
13265 }
13266 else if (maxchar > writer->maxchar) {
13267 if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
13268 return -1;
13269 unicode_writer_update(writer);
13270 }
13271 return 0;
13272}
13273
13274Py_LOCAL_INLINE(int)
13275unicode_writer_write_str(
13276 unicode_writer_t *writer,
13277 PyObject *str, Py_ssize_t start, Py_ssize_t length)
13278{
13279 Py_UCS4 maxchar;
13280
13281 assert(str != NULL);
13282 assert(PyUnicode_Check(str));
13283 if (PyUnicode_READY(str) == -1)
13284 return -1;
13285
13286 assert(0 <= start);
13287 assert(0 <= length);
13288 assert(start + length <= PyUnicode_GET_LENGTH(str));
13289 if (length == 0)
13290 return 0;
13291
13292 maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
13293 if (unicode_writer_prepare(writer, length, maxchar) == -1)
13294 return -1;
13295
13296 assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer));
13297 copy_characters(writer->buffer, writer->pos,
13298 str, start, length);
13299 writer->pos += length;
13300 return 0;
13301}
13302
13303Py_LOCAL_INLINE(int)
13304unicode_writer_write_char(
13305 unicode_writer_t *writer,
13306 Py_UCS4 ch)
13307{
13308 if (unicode_writer_prepare(writer, 1, ch) == -1)
13309 return -1;
13310 assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer));
13311 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13312 writer->pos += 1;
13313 return 0;
13314}
13315
13316Py_LOCAL(PyObject *)
13317unicode_writer_finish(unicode_writer_t *writer)
13318{
13319 if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
13320 Py_DECREF(writer->buffer);
13321 return NULL;
13322 }
13323 return writer->buffer;
13324}
13325
13326Py_LOCAL(void)
13327unicode_writer_dealloc(unicode_writer_t *writer)
13328{
13329 Py_CLEAR(writer->buffer);
13330}
13331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013333
13334PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013336\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013337Return a formatted version of S, using substitutions from args and kwargs.\n\
13338The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013339
Eric Smith27bbca62010-11-04 17:06:58 +000013340PyDoc_STRVAR(format_map__doc__,
13341 "S.format_map(mapping) -> str\n\
13342\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013343Return a formatted version of S, using substitutions from mapping.\n\
13344The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013345
Eric Smith4a7d76d2008-05-30 18:10:19 +000013346static PyObject *
13347unicode__format__(PyObject* self, PyObject* args)
13348{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013349 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013350
13351 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13352 return NULL;
13353
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013354 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013356 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013357}
13358
Eric Smith8c663262007-08-25 02:26:07 +000013359PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013360 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013361\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013362Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013363
13364static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013365unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367 Py_ssize_t size;
13368
13369 /* If it's a compact object, account for base structure +
13370 character data. */
13371 if (PyUnicode_IS_COMPACT_ASCII(v))
13372 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13373 else if (PyUnicode_IS_COMPACT(v))
13374 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013375 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013376 else {
13377 /* If it is a two-block object, account for base object, and
13378 for character block if present. */
13379 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013380 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013381 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013382 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 }
13384 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013385 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013386 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013388 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013389 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390
13391 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013392}
13393
13394PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013396
13397static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013398unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013399{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013400 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013401 if (!copy)
13402 return NULL;
13403 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013404}
13405
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013407 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013408 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013409 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13410 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013411 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13412 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013413 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013414 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13415 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13416 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13417 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13418 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013419 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013420 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13421 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13422 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013423 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013424 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13425 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13426 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013427 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013428 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013429 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013430 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013431 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13432 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13433 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13434 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13435 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13436 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13437 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13438 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13439 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13440 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13441 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13442 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13443 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13444 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013445 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013446 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013447 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013448 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013449 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013450 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013451 {"maketrans", (PyCFunction) unicode_maketrans,
13452 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013453 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013454#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013455 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013456 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457#endif
13458
Benjamin Peterson14339b62009-01-31 16:36:08 +000013459 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460 {NULL, NULL}
13461};
13462
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013463static PyObject *
13464unicode_mod(PyObject *v, PyObject *w)
13465{
Brian Curtindfc80e32011-08-10 20:28:54 -050013466 if (!PyUnicode_Check(v))
13467 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013469}
13470
13471static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 0, /*nb_add*/
13473 0, /*nb_subtract*/
13474 0, /*nb_multiply*/
13475 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013476};
13477
Guido van Rossumd57fd912000-03-10 22:53:23 +000013478static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013479 (lenfunc) unicode_length, /* sq_length */
13480 PyUnicode_Concat, /* sq_concat */
13481 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13482 (ssizeargfunc) unicode_getitem, /* sq_item */
13483 0, /* sq_slice */
13484 0, /* sq_ass_item */
13485 0, /* sq_ass_slice */
13486 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013487};
13488
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013489static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013490unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 if (PyUnicode_READY(self) == -1)
13493 return NULL;
13494
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013495 if (PyIndex_Check(item)) {
13496 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013497 if (i == -1 && PyErr_Occurred())
13498 return NULL;
13499 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013501 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013502 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013503 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013504 PyObject *result;
13505 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013506 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013507 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013509 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013511 return NULL;
13512 }
13513
13514 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013515 Py_INCREF(unicode_empty);
13516 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013517 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013518 slicelength == PyUnicode_GET_LENGTH(self)) {
13519 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013520 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013521 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013522 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013523 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013524 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013525 src_kind = PyUnicode_KIND(self);
13526 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013527 if (!PyUnicode_IS_ASCII(self)) {
13528 kind_limit = kind_maxchar_limit(src_kind);
13529 max_char = 0;
13530 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13531 ch = PyUnicode_READ(src_kind, src_data, cur);
13532 if (ch > max_char) {
13533 max_char = ch;
13534 if (max_char >= kind_limit)
13535 break;
13536 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013537 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013538 }
Victor Stinner55c99112011-10-13 01:17:06 +020013539 else
13540 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013541 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013542 if (result == NULL)
13543 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013544 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013545 dest_data = PyUnicode_DATA(result);
13546
13547 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013548 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13549 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013550 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013551 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013552 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013553 } else {
13554 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13555 return NULL;
13556 }
13557}
13558
13559static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013560 (lenfunc)unicode_length, /* mp_length */
13561 (binaryfunc)unicode_subscript, /* mp_subscript */
13562 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013563};
13564
Guido van Rossumd57fd912000-03-10 22:53:23 +000013565
Guido van Rossumd57fd912000-03-10 22:53:23 +000013566/* Helpers for PyUnicode_Format() */
13567
13568static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013569getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013570{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013571 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 (*p_argidx)++;
13574 if (arglen < 0)
13575 return args;
13576 else
13577 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013578 }
13579 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013581 return NULL;
13582}
13583
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013584/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013585
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013586static PyObject *
13587formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013588{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013589 char *p;
13590 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013591 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013592
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593 x = PyFloat_AsDouble(v);
13594 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013595 return NULL;
13596
Guido van Rossumd57fd912000-03-10 22:53:23 +000013597 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013599
Eric Smith0923d1d2009-04-16 20:16:10 +000013600 p = PyOS_double_to_string(x, type, prec,
13601 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013602 if (p == NULL)
13603 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013604 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013605 PyMem_Free(p);
13606 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013607}
13608
Victor Stinnerd0880d52012-04-27 23:40:13 +020013609/* formatlong() emulates the format codes d, u, o, x and X, and
13610 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13611 * Python's regular ints.
13612 * Return value: a new PyUnicodeObject*, or NULL if error.
13613 * The output string is of the form
13614 * "-"? ("0x" | "0X")? digit+
13615 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13616 * set in flags. The case of hex digits will be correct,
13617 * There will be at least prec digits, zero-filled on the left if
13618 * necessary to get that many.
13619 * val object to be converted
13620 * flags bitmask of format flags; only F_ALT is looked at
13621 * prec minimum number of digits; 0-fill on left if needed
13622 * type a character in [duoxX]; u acts the same as d
13623 *
13624 * CAUTION: o, x and X conversions on regular ints can never
13625 * produce a '-' sign, but can for Python's unbounded ints.
13626 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013627static PyObject*
13628formatlong(PyObject *val, int flags, int prec, int type)
13629{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013630 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013631 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013632 Py_ssize_t i;
13633 int sign; /* 1 if '-', else 0 */
13634 int len; /* number of characters */
13635 Py_ssize_t llen;
13636 int numdigits; /* len == numnondigits + numdigits */
13637 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013638
Victor Stinnerd0880d52012-04-27 23:40:13 +020013639 /* Avoid exceeding SSIZE_T_MAX */
13640 if (prec > INT_MAX-3) {
13641 PyErr_SetString(PyExc_OverflowError,
13642 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013643 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013644 }
13645
13646 assert(PyLong_Check(val));
13647
13648 switch (type) {
13649 case 'd':
13650 case 'u':
13651 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013652 if (PyBool_Check(val))
13653 result = PyNumber_ToBase(val, 10);
13654 else
13655 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013656 break;
13657 case 'o':
13658 numnondigits = 2;
13659 result = PyNumber_ToBase(val, 8);
13660 break;
13661 case 'x':
13662 case 'X':
13663 numnondigits = 2;
13664 result = PyNumber_ToBase(val, 16);
13665 break;
13666 default:
13667 assert(!"'type' not in [duoxX]");
13668 }
13669 if (!result)
13670 return NULL;
13671
13672 assert(unicode_modifiable(result));
13673 assert(PyUnicode_IS_READY(result));
13674 assert(PyUnicode_IS_ASCII(result));
13675
13676 /* To modify the string in-place, there can only be one reference. */
13677 if (Py_REFCNT(result) != 1) {
13678 PyErr_BadInternalCall();
13679 return NULL;
13680 }
13681 buf = PyUnicode_DATA(result);
13682 llen = PyUnicode_GET_LENGTH(result);
13683 if (llen > INT_MAX) {
13684 PyErr_SetString(PyExc_ValueError,
13685 "string too large in _PyBytes_FormatLong");
13686 return NULL;
13687 }
13688 len = (int)llen;
13689 sign = buf[0] == '-';
13690 numnondigits += sign;
13691 numdigits = len - numnondigits;
13692 assert(numdigits > 0);
13693
13694 /* Get rid of base marker unless F_ALT */
13695 if (((flags & F_ALT) == 0 &&
13696 (type == 'o' || type == 'x' || type == 'X'))) {
13697 assert(buf[sign] == '0');
13698 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13699 buf[sign+1] == 'o');
13700 numnondigits -= 2;
13701 buf += 2;
13702 len -= 2;
13703 if (sign)
13704 buf[0] = '-';
13705 assert(len == numnondigits + numdigits);
13706 assert(numdigits > 0);
13707 }
13708
13709 /* Fill with leading zeroes to meet minimum width. */
13710 if (prec > numdigits) {
13711 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13712 numnondigits + prec);
13713 char *b1;
13714 if (!r1) {
13715 Py_DECREF(result);
13716 return NULL;
13717 }
13718 b1 = PyBytes_AS_STRING(r1);
13719 for (i = 0; i < numnondigits; ++i)
13720 *b1++ = *buf++;
13721 for (i = 0; i < prec - numdigits; i++)
13722 *b1++ = '0';
13723 for (i = 0; i < numdigits; i++)
13724 *b1++ = *buf++;
13725 *b1 = '\0';
13726 Py_DECREF(result);
13727 result = r1;
13728 buf = PyBytes_AS_STRING(result);
13729 len = numnondigits + prec;
13730 }
13731
13732 /* Fix up case for hex conversions. */
13733 if (type == 'X') {
13734 /* Need to convert all lower case letters to upper case.
13735 and need to convert 0x to 0X (and -0x to -0X). */
13736 for (i = 0; i < len; i++)
13737 if (buf[i] >= 'a' && buf[i] <= 'x')
13738 buf[i] -= 'a'-'A';
13739 }
13740 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13741 PyObject *unicode;
13742 unicode = unicode_fromascii((unsigned char *)buf, len);
13743 Py_DECREF(result);
13744 result = unicode;
13745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013746 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013747}
13748
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013749static Py_UCS4
13750formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013751{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013752 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013753 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013754 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013755 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013756 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 goto onError;
13758 }
13759 else {
13760 /* Integer input truncated to a character */
13761 long x;
13762 x = PyLong_AsLong(v);
13763 if (x == -1 && PyErr_Occurred())
13764 goto onError;
13765
Victor Stinner8faf8212011-12-08 22:14:11 +010013766 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013767 PyErr_SetString(PyExc_OverflowError,
13768 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013769 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 }
13771
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013772 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013773 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013774
Benjamin Peterson29060642009-01-31 22:14:21 +000013775 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013776 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013778 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779}
13780
Alexander Belopolsky40018472011-02-26 01:02:56 +000013781PyObject *
13782PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013783{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013784 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013785 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013787 PyObject *temp = NULL;
13788 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013789 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013790 void *fmt;
13791 enum PyUnicode_Kind kind, fmtkind;
Victor Stinnera7b654b2012-05-03 23:58:55 +020013792 unicode_writer_t writer;
Tim Petersced69f82003-09-16 20:30:58 +000013793
Guido van Rossumd57fd912000-03-10 22:53:23 +000013794 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 PyErr_BadInternalCall();
13796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013798 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013799 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013801 if (PyUnicode_READY(uformat) == -1)
13802 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 fmt = PyUnicode_DATA(uformat);
13805 fmtkind = PyUnicode_KIND(uformat);
13806 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13807 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013809 if (unicode_writer_init(&writer, fmtcnt + 100, 127) < 0)
13810 goto onError;
13811
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013813 arglen = PyTuple_Size(args);
13814 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815 }
13816 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 arglen = -1;
13818 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013820 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013821 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823
13824 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013826 Py_ssize_t nonfmtpos;
13827 nonfmtpos = fmtpos++;
13828 while (fmtcnt >= 0 &&
13829 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13830 fmtpos++;
13831 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013832 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013833 if (fmtcnt < 0)
13834 fmtpos--;
13835 if (unicode_writer_write_str(&writer, uformat, nonfmtpos, fmtpos - nonfmtpos) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013836 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013837 }
13838 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 /* Got a format specifier */
13840 int flags = 0;
13841 Py_ssize_t width = -1;
13842 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013843 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013844 Py_UCS4 fill;
13845 int sign;
13846 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013847 int isnumok;
13848 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013849 void *pbuf = NULL;
13850 Py_ssize_t pindex, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013852 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013853 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13854 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013855 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013856 Py_ssize_t keylen;
13857 PyObject *key;
13858 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013859
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 if (dict == NULL) {
13861 PyErr_SetString(PyExc_TypeError,
13862 "format requires a mapping");
13863 goto onError;
13864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013865 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013867 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 /* Skip over balanced parentheses */
13869 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013870 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13871 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013873 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 if (fmtcnt < 0 || pcount > 0) {
13879 PyErr_SetString(PyExc_ValueError,
13880 "incomplete format key");
13881 goto onError;
13882 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013883 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013884 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 if (key == NULL)
13886 goto onError;
13887 if (args_owned) {
13888 Py_DECREF(args);
13889 args_owned = 0;
13890 }
13891 args = PyObject_GetItem(dict, key);
13892 Py_DECREF(key);
13893 if (args == NULL) {
13894 goto onError;
13895 }
13896 args_owned = 1;
13897 arglen = -1;
13898 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013899 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013901 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13902 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013903 case '-': flags |= F_LJUST; continue;
13904 case '+': flags |= F_SIGN; continue;
13905 case ' ': flags |= F_BLANK; continue;
13906 case '#': flags |= F_ALT; continue;
13907 case '0': flags |= F_ZERO; continue;
13908 }
13909 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013910 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013911 if (c == '*') {
13912 v = getnextarg(args, arglen, &argidx);
13913 if (v == NULL)
13914 goto onError;
13915 if (!PyLong_Check(v)) {
13916 PyErr_SetString(PyExc_TypeError,
13917 "* wants int");
13918 goto onError;
13919 }
13920 width = PyLong_AsLong(v);
13921 if (width == -1 && PyErr_Occurred())
13922 goto onError;
13923 if (width < 0) {
13924 flags |= F_LJUST;
13925 width = -width;
13926 }
13927 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013928 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 }
13930 else if (c >= '0' && c <= '9') {
13931 width = c - '0';
13932 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013933 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 if (c < '0' || c > '9')
13935 break;
Mark Dickinson99e2e552012-05-07 11:20:50 +010013936 if (width > (PY_SSIZE_T_MAX - (c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 PyErr_SetString(PyExc_ValueError,
13938 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013939 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 }
13941 width = width*10 + (c - '0');
13942 }
13943 }
13944 if (c == '.') {
13945 prec = 0;
13946 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 if (c == '*') {
13949 v = getnextarg(args, arglen, &argidx);
13950 if (v == NULL)
13951 goto onError;
13952 if (!PyLong_Check(v)) {
13953 PyErr_SetString(PyExc_TypeError,
13954 "* wants int");
13955 goto onError;
13956 }
13957 prec = PyLong_AsLong(v);
13958 if (prec == -1 && PyErr_Occurred())
13959 goto onError;
13960 if (prec < 0)
13961 prec = 0;
13962 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013964 }
13965 else if (c >= '0' && c <= '9') {
13966 prec = c - '0';
13967 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013968 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013969 if (c < '0' || c > '9')
13970 break;
Mark Dickinson99e2e552012-05-07 11:20:50 +010013971 if (prec > (INT_MAX - (c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013972 PyErr_SetString(PyExc_ValueError,
13973 "prec too big");
13974 goto onError;
13975 }
13976 prec = prec*10 + (c - '0');
13977 }
13978 }
13979 } /* prec */
13980 if (fmtcnt >= 0) {
13981 if (c == 'h' || c == 'l' || c == 'L') {
13982 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013983 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013984 }
13985 }
13986 if (fmtcnt < 0) {
13987 PyErr_SetString(PyExc_ValueError,
13988 "incomplete format");
13989 goto onError;
13990 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013991
13992 if (c == '%') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013993 if (unicode_writer_write_char(&writer, '%') < 0)
13994 goto onError;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013995 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013996 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013997
13998
13999 v = getnextarg(args, arglen, &argidx);
14000 if (v == NULL)
14001 goto onError;
14002
Benjamin Peterson29060642009-01-31 22:14:21 +000014003 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014004 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000014005 fill = ' ';
14006 switch (c) {
14007
Benjamin Peterson29060642009-01-31 22:14:21 +000014008 case 's':
14009 case 'r':
14010 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000014011 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 temp = v;
14013 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 }
14015 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000014016 if (c == 's')
14017 temp = PyObject_Str(v);
14018 else if (c == 'r')
14019 temp = PyObject_Repr(v);
14020 else
14021 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000014022 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014023 break;
14024
14025 case 'i':
14026 case 'd':
14027 case 'u':
14028 case 'o':
14029 case 'x':
14030 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 isnumok = 0;
14032 if (PyNumber_Check(v)) {
14033 PyObject *iobj=NULL;
14034
14035 if (PyLong_Check(v)) {
14036 iobj = v;
14037 Py_INCREF(iobj);
14038 }
14039 else {
14040 iobj = PyNumber_Long(v);
14041 }
14042 if (iobj!=NULL) {
14043 if (PyLong_Check(iobj)) {
14044 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020014045 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070014046 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000014047 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000014048 }
14049 else {
14050 Py_DECREF(iobj);
14051 }
14052 }
14053 }
14054 if (!isnumok) {
14055 PyErr_Format(PyExc_TypeError,
14056 "%%%c format: a number is required, "
14057 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
14058 goto onError;
14059 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014060 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014061 fill = '0';
14062 break;
14063
14064 case 'e':
14065 case 'E':
14066 case 'f':
14067 case 'F':
14068 case 'g':
14069 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000014070 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014071 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 fill = '0';
Victor Stinneraff3cc62012-04-30 05:19:21 +020014073 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014074 break;
14075
14076 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014077 {
14078 Py_UCS4 ch = formatchar(v);
14079 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014080 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020014081 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000014082 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014083 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014084
14085 default:
14086 PyErr_Format(PyExc_ValueError,
14087 "unsupported format character '%c' (0x%x) "
14088 "at index %zd",
14089 (31<=c && c<=126) ? (char)c : '?',
14090 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014091 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000014092 goto onError;
14093 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014094 if (temp == NULL)
14095 goto onError;
14096 assert (PyUnicode_Check(temp));
14097 if (PyUnicode_READY(temp) == -1) {
14098 Py_CLEAR(temp);
14099 goto onError;
14100 }
14101 kind = PyUnicode_KIND(temp);
14102 pbuf = PyUnicode_DATA(temp);
14103 len = PyUnicode_GET_LENGTH(temp);
14104
14105 if (c == 's' || c == 'r' || c == 'a') {
14106 if (prec >= 0 && len > prec)
14107 len = prec;
14108 }
14109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014110 /* pbuf is initialized here. */
14111 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000014112 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014113 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14114 if (ch == '-' || ch == '+') {
14115 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014116 len--;
14117 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000014118 }
14119 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014120 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000014121 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014122 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000014123 else
14124 sign = 0;
14125 }
14126 if (width < len)
14127 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014128 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014129 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014130 if (unicode_writer_write_char(&writer, signchar) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014131 goto onError;
14132 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014133 if (width > len)
14134 width--;
14135 }
14136 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014137 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014138 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014140 if (unicode_writer_prepare(&writer, 2, 127) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014141 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014142 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14143 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14144 writer.pos += 2;
14145 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000014146 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014147 width -= 2;
14148 if (width < 0)
14149 width = 0;
14150 len -= 2;
14151 }
14152 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014153 Py_ssize_t sublen;
14154 sublen = width - len;
14155 if (unicode_writer_prepare(&writer, sublen, fill) < 0)
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014156 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014157 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
14158 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014159 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014160 }
14161 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014162 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014163 if (unicode_writer_write_char(&writer, signchar) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014164 goto onError;
14165 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014167 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14168 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014169
14170 if (unicode_writer_prepare(&writer, 2, 127) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014171 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014172 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14173 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14174 writer.pos += 2;
14175
14176 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 }
14178 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014180 /* Copy all characters, preserving len */
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014181 if (unicode_writer_write_str(&writer, temp, pindex, len) < 0)
14182 goto onError;
14183 if (width > len) {
14184 Py_ssize_t sublen = width - len;
14185 if (unicode_writer_prepare(&writer, sublen, ' ') < 0)
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014186 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014187 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
14188 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014189 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014190 if (dict && (argidx < arglen) && c != '%') {
14191 PyErr_SetString(PyExc_TypeError,
14192 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014193 goto onError;
14194 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014195 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014196 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197 } /* until end */
14198 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014199 PyErr_SetString(PyExc_TypeError,
14200 "not all arguments converted during string formatting");
14201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014202 }
14203
14204 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014205 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206 }
14207 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014208 Py_XDECREF(temp);
14209 Py_XDECREF(second);
Victor Stinnera7b654b2012-05-03 23:58:55 +020014210 return unicode_writer_finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014211
Benjamin Peterson29060642009-01-31 22:14:21 +000014212 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014213 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014214 Py_XDECREF(temp);
14215 Py_XDECREF(second);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014216 unicode_writer_dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014217 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014218 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014219 }
14220 return NULL;
14221}
14222
Jeremy Hylton938ace62002-07-17 16:30:39 +000014223static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014224unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14225
Tim Peters6d6c1a32001-08-02 04:15:00 +000014226static PyObject *
14227unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14228{
Benjamin Peterson29060642009-01-31 22:14:21 +000014229 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 static char *kwlist[] = {"object", "encoding", "errors", 0};
14231 char *encoding = NULL;
14232 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014233
Benjamin Peterson14339b62009-01-31 16:36:08 +000014234 if (type != &PyUnicode_Type)
14235 return unicode_subtype_new(type, args, kwds);
14236 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014237 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014238 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014239 if (x == NULL) {
14240 Py_INCREF(unicode_empty);
14241 return unicode_empty;
14242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 if (encoding == NULL && errors == NULL)
14244 return PyObject_Str(x);
14245 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014246 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014247}
14248
Guido van Rossume023fe02001-08-30 03:12:59 +000014249static PyObject *
14250unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14251{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014252 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014253 Py_ssize_t length, char_size;
14254 int share_wstr, share_utf8;
14255 unsigned int kind;
14256 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014257
Benjamin Peterson14339b62009-01-31 16:36:08 +000014258 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014259
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014260 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014261 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014262 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014263 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014264 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014265 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014266 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014267 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014268
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014269 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014270 if (self == NULL) {
14271 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014272 return NULL;
14273 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014274 kind = PyUnicode_KIND(unicode);
14275 length = PyUnicode_GET_LENGTH(unicode);
14276
14277 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014278#ifdef Py_DEBUG
14279 _PyUnicode_HASH(self) = -1;
14280#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014281 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014282#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014283 _PyUnicode_STATE(self).interned = 0;
14284 _PyUnicode_STATE(self).kind = kind;
14285 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014286 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014287 _PyUnicode_STATE(self).ready = 1;
14288 _PyUnicode_WSTR(self) = NULL;
14289 _PyUnicode_UTF8_LENGTH(self) = 0;
14290 _PyUnicode_UTF8(self) = NULL;
14291 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014292 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014293
14294 share_utf8 = 0;
14295 share_wstr = 0;
14296 if (kind == PyUnicode_1BYTE_KIND) {
14297 char_size = 1;
14298 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14299 share_utf8 = 1;
14300 }
14301 else if (kind == PyUnicode_2BYTE_KIND) {
14302 char_size = 2;
14303 if (sizeof(wchar_t) == 2)
14304 share_wstr = 1;
14305 }
14306 else {
14307 assert(kind == PyUnicode_4BYTE_KIND);
14308 char_size = 4;
14309 if (sizeof(wchar_t) == 4)
14310 share_wstr = 1;
14311 }
14312
14313 /* Ensure we won't overflow the length. */
14314 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14315 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014316 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014317 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014318 data = PyObject_MALLOC((length + 1) * char_size);
14319 if (data == NULL) {
14320 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014321 goto onError;
14322 }
14323
Victor Stinnerc3c74152011-10-02 20:39:55 +020014324 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014325 if (share_utf8) {
14326 _PyUnicode_UTF8_LENGTH(self) = length;
14327 _PyUnicode_UTF8(self) = data;
14328 }
14329 if (share_wstr) {
14330 _PyUnicode_WSTR_LENGTH(self) = length;
14331 _PyUnicode_WSTR(self) = (wchar_t *)data;
14332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014333
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014334 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014335 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014336 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014337#ifdef Py_DEBUG
14338 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14339#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014340 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014341 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014342
14343onError:
14344 Py_DECREF(unicode);
14345 Py_DECREF(self);
14346 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014347}
14348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014349PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014350 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014351\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014352Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014353encoding defaults to the current default string encoding.\n\
14354errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014355
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014356static PyObject *unicode_iter(PyObject *seq);
14357
Guido van Rossumd57fd912000-03-10 22:53:23 +000014358PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014359 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014360 "str", /* tp_name */
14361 sizeof(PyUnicodeObject), /* tp_size */
14362 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014363 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 (destructor)unicode_dealloc, /* tp_dealloc */
14365 0, /* tp_print */
14366 0, /* tp_getattr */
14367 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014368 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 unicode_repr, /* tp_repr */
14370 &unicode_as_number, /* tp_as_number */
14371 &unicode_as_sequence, /* tp_as_sequence */
14372 &unicode_as_mapping, /* tp_as_mapping */
14373 (hashfunc) unicode_hash, /* tp_hash*/
14374 0, /* tp_call*/
14375 (reprfunc) unicode_str, /* tp_str */
14376 PyObject_GenericGetAttr, /* tp_getattro */
14377 0, /* tp_setattro */
14378 0, /* tp_as_buffer */
14379 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014380 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 unicode_doc, /* tp_doc */
14382 0, /* tp_traverse */
14383 0, /* tp_clear */
14384 PyUnicode_RichCompare, /* tp_richcompare */
14385 0, /* tp_weaklistoffset */
14386 unicode_iter, /* tp_iter */
14387 0, /* tp_iternext */
14388 unicode_methods, /* tp_methods */
14389 0, /* tp_members */
14390 0, /* tp_getset */
14391 &PyBaseObject_Type, /* tp_base */
14392 0, /* tp_dict */
14393 0, /* tp_descr_get */
14394 0, /* tp_descr_set */
14395 0, /* tp_dictoffset */
14396 0, /* tp_init */
14397 0, /* tp_alloc */
14398 unicode_new, /* tp_new */
14399 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014400};
14401
14402/* Initialize the Unicode implementation */
14403
Victor Stinner3a50e702011-10-18 21:21:00 +020014404int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014405{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014406 int i;
14407
Thomas Wouters477c8d52006-05-27 19:21:47 +000014408 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014409 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014410 0x000A, /* LINE FEED */
14411 0x000D, /* CARRIAGE RETURN */
14412 0x001C, /* FILE SEPARATOR */
14413 0x001D, /* GROUP SEPARATOR */
14414 0x001E, /* RECORD SEPARATOR */
14415 0x0085, /* NEXT LINE */
14416 0x2028, /* LINE SEPARATOR */
14417 0x2029, /* PARAGRAPH SEPARATOR */
14418 };
14419
Fred Drakee4315f52000-05-09 19:53:39 +000014420 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014421 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014422 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014423 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014424 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014425
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014426 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014428 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014429 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014430
14431 /* initialize the linebreak bloom filter */
14432 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014433 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014434 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014435
14436 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014437
14438#ifdef HAVE_MBCS
14439 winver.dwOSVersionInfoSize = sizeof(winver);
14440 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14441 PyErr_SetFromWindowsErr(0);
14442 return -1;
14443 }
14444#endif
14445 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014446}
14447
14448/* Finalize the Unicode implementation */
14449
/* Free-list clearing hook.  This implementation performs no work and
   always reports 0 (presumably the number of entries released --
   confirm against callers). */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14455
Guido van Rossumd57fd912000-03-10 22:53:23 +000014456void
Thomas Wouters78890102000-07-22 19:25:51 +000014457_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014458{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014459 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014460
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014461 Py_XDECREF(unicode_empty);
14462 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014463
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014464 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014465 if (unicode_latin1[i]) {
14466 Py_DECREF(unicode_latin1[i]);
14467 unicode_latin1[i] = NULL;
14468 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014469 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014470 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014471 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014472}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014473
Walter Dörwald16807132007-05-25 13:52:07 +000014474void
14475PyUnicode_InternInPlace(PyObject **p)
14476{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014477 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014479#ifdef Py_DEBUG
14480 assert(s != NULL);
14481 assert(_PyUnicode_CHECK(s));
14482#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014484 return;
14485#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 /* If it's a subclass, we don't really know what putting
14487 it in the interned dict might do. */
14488 if (!PyUnicode_CheckExact(s))
14489 return;
14490 if (PyUnicode_CHECK_INTERNED(s))
14491 return;
14492 if (interned == NULL) {
14493 interned = PyDict_New();
14494 if (interned == NULL) {
14495 PyErr_Clear(); /* Don't leave an exception */
14496 return;
14497 }
14498 }
14499 /* It might be that the GetItem call fails even
14500 though the key is present in the dictionary,
14501 namely when this happens during a stack overflow. */
14502 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014503 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014505
Benjamin Peterson29060642009-01-31 22:14:21 +000014506 if (t) {
14507 Py_INCREF(t);
14508 Py_DECREF(*p);
14509 *p = t;
14510 return;
14511 }
Walter Dörwald16807132007-05-25 13:52:07 +000014512
Benjamin Peterson14339b62009-01-31 16:36:08 +000014513 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014514 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014515 PyErr_Clear();
14516 PyThreadState_GET()->recursion_critical = 0;
14517 return;
14518 }
14519 PyThreadState_GET()->recursion_critical = 0;
14520 /* The two references in interned are not counted by refcnt.
14521 The deallocator will take care of this */
14522 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014523 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014524}
14525
14526void
14527PyUnicode_InternImmortal(PyObject **p)
14528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 PyUnicode_InternInPlace(p);
14530 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014531 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014532 Py_INCREF(*p);
14533 }
Walter Dörwald16807132007-05-25 13:52:07 +000014534}
14535
14536PyObject *
14537PyUnicode_InternFromString(const char *cp)
14538{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 PyObject *s = PyUnicode_FromString(cp);
14540 if (s == NULL)
14541 return NULL;
14542 PyUnicode_InternInPlace(&s);
14543 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014544}
14545
Alexander Belopolsky40018472011-02-26 01:02:56 +000014546void
14547_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014548{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014549 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014550 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014551 Py_ssize_t i, n;
14552 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014553
Benjamin Peterson14339b62009-01-31 16:36:08 +000014554 if (interned == NULL || !PyDict_Check(interned))
14555 return;
14556 keys = PyDict_Keys(interned);
14557 if (keys == NULL || !PyList_Check(keys)) {
14558 PyErr_Clear();
14559 return;
14560 }
Walter Dörwald16807132007-05-25 13:52:07 +000014561
Benjamin Peterson14339b62009-01-31 16:36:08 +000014562 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14563 detector, interned unicode strings are not forcibly deallocated;
14564 rather, we give them their stolen references back, and then clear
14565 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014566
Benjamin Peterson14339b62009-01-31 16:36:08 +000014567 n = PyList_GET_SIZE(keys);
14568 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014569 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014570 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014571 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014572 if (PyUnicode_READY(s) == -1) {
14573 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014574 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014576 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014577 case SSTATE_NOT_INTERNED:
14578 /* XXX Shouldn't happen */
14579 break;
14580 case SSTATE_INTERNED_IMMORTAL:
14581 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014582 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014583 break;
14584 case SSTATE_INTERNED_MORTAL:
14585 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014586 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014587 break;
14588 default:
14589 Py_FatalError("Inconsistent interned string state.");
14590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014591 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014592 }
14593 fprintf(stderr, "total size of all interned strings: "
14594 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14595 "mortal/immortal\n", mortal_size, immortal_size);
14596 Py_DECREF(keys);
14597 PyDict_Clear(interned);
14598 Py_DECREF(interned);
14599 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014600}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014601
14602
14603/********************* Unicode Iterator **************************/
14604
14605typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014606 PyObject_HEAD
14607 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014608 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014609} unicodeiterobject;
14610
14611static void
14612unicodeiter_dealloc(unicodeiterobject *it)
14613{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014614 _PyObject_GC_UNTRACK(it);
14615 Py_XDECREF(it->it_seq);
14616 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014617}
14618
14619static int
14620unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014622 Py_VISIT(it->it_seq);
14623 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014624}
14625
14626static PyObject *
14627unicodeiter_next(unicodeiterobject *it)
14628{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014629 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014630
Benjamin Peterson14339b62009-01-31 16:36:08 +000014631 assert(it != NULL);
14632 seq = it->it_seq;
14633 if (seq == NULL)
14634 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014635 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014637 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14638 int kind = PyUnicode_KIND(seq);
14639 void *data = PyUnicode_DATA(seq);
14640 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14641 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014642 if (item != NULL)
14643 ++it->it_index;
14644 return item;
14645 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014646
Benjamin Peterson14339b62009-01-31 16:36:08 +000014647 Py_DECREF(seq);
14648 it->it_seq = NULL;
14649 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014650}
14651
14652static PyObject *
14653unicodeiter_len(unicodeiterobject *it)
14654{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014655 Py_ssize_t len = 0;
14656 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014657 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014658 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014659}
14660
14661PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14662
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014663static PyObject *
14664unicodeiter_reduce(unicodeiterobject *it)
14665{
14666 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014667 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014668 it->it_seq, it->it_index);
14669 } else {
14670 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14671 if (u == NULL)
14672 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014673 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014674 }
14675}
14676
14677PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14678
14679static PyObject *
14680unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14681{
14682 Py_ssize_t index = PyLong_AsSsize_t(state);
14683 if (index == -1 && PyErr_Occurred())
14684 return NULL;
14685 if (index < 0)
14686 index = 0;
14687 it->it_index = index;
14688 Py_RETURN_NONE;
14689}
14690
14691PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14692
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014693static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014694 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014695 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014696 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14697 reduce_doc},
14698 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14699 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014700 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014701};
14702
14703PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014704 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14705 "str_iterator", /* tp_name */
14706 sizeof(unicodeiterobject), /* tp_basicsize */
14707 0, /* tp_itemsize */
14708 /* methods */
14709 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14710 0, /* tp_print */
14711 0, /* tp_getattr */
14712 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014713 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014714 0, /* tp_repr */
14715 0, /* tp_as_number */
14716 0, /* tp_as_sequence */
14717 0, /* tp_as_mapping */
14718 0, /* tp_hash */
14719 0, /* tp_call */
14720 0, /* tp_str */
14721 PyObject_GenericGetAttr, /* tp_getattro */
14722 0, /* tp_setattro */
14723 0, /* tp_as_buffer */
14724 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14725 0, /* tp_doc */
14726 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14727 0, /* tp_clear */
14728 0, /* tp_richcompare */
14729 0, /* tp_weaklistoffset */
14730 PyObject_SelfIter, /* tp_iter */
14731 (iternextfunc)unicodeiter_next, /* tp_iternext */
14732 unicodeiter_methods, /* tp_methods */
14733 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014734};
14735
14736static PyObject *
14737unicode_iter(PyObject *seq)
14738{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014739 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014740
Benjamin Peterson14339b62009-01-31 16:36:08 +000014741 if (!PyUnicode_Check(seq)) {
14742 PyErr_BadInternalCall();
14743 return NULL;
14744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014745 if (PyUnicode_READY(seq) == -1)
14746 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014747 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14748 if (it == NULL)
14749 return NULL;
14750 it->it_index = 0;
14751 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014752 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014753 _PyObject_GC_TRACK(it);
14754 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014755}
14756
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014757
14758size_t
14759Py_UNICODE_strlen(const Py_UNICODE *u)
14760{
14761 int res = 0;
14762 while(*u++)
14763 res++;
14764 return res;
14765}
14766
14767Py_UNICODE*
14768Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14769{
14770 Py_UNICODE *u = s1;
14771 while ((*u++ = *s2++));
14772 return s1;
14773}
14774
14775Py_UNICODE*
14776Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14777{
14778 Py_UNICODE *u = s1;
14779 while ((*u++ = *s2++))
14780 if (n-- == 0)
14781 break;
14782 return s1;
14783}
14784
14785Py_UNICODE*
14786Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14787{
14788 Py_UNICODE *u1 = s1;
14789 u1 += Py_UNICODE_strlen(u1);
14790 Py_UNICODE_strcpy(u1, s2);
14791 return s1;
14792}
14793
14794int
14795Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14796{
14797 while (*s1 && *s2 && *s1 == *s2)
14798 s1++, s2++;
14799 if (*s1 && *s2)
14800 return (*s1 < *s2) ? -1 : +1;
14801 if (*s1)
14802 return 1;
14803 if (*s2)
14804 return -1;
14805 return 0;
14806}
14807
14808int
14809Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14810{
14811 register Py_UNICODE u1, u2;
14812 for (; n != 0; n--) {
14813 u1 = *s1;
14814 u2 = *s2;
14815 if (u1 != u2)
14816 return (u1 < u2) ? -1 : +1;
14817 if (u1 == '\0')
14818 return 0;
14819 s1++;
14820 s2++;
14821 }
14822 return 0;
14823}
14824
14825Py_UNICODE*
14826Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14827{
14828 const Py_UNICODE *p;
14829 for (p = s; *p; p++)
14830 if (*p == c)
14831 return (Py_UNICODE*)p;
14832 return NULL;
14833}
14834
14835Py_UNICODE*
14836Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14837{
14838 const Py_UNICODE *p;
14839 p = s + Py_UNICODE_strlen(s);
14840 while (p != s) {
14841 p--;
14842 if (*p == c)
14843 return (Py_UNICODE*)p;
14844 }
14845 return NULL;
14846}
Victor Stinner331ea922010-08-10 16:37:20 +000014847
Victor Stinner71133ff2010-09-01 23:43:53 +000014848Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014849PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014850{
Victor Stinner577db2c2011-10-11 22:12:48 +020014851 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014852 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014854 if (!PyUnicode_Check(unicode)) {
14855 PyErr_BadArgument();
14856 return NULL;
14857 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014858 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014859 if (u == NULL)
14860 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014861 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014862 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014863 PyErr_NoMemory();
14864 return NULL;
14865 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014866 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014867 size *= sizeof(Py_UNICODE);
14868 copy = PyMem_Malloc(size);
14869 if (copy == NULL) {
14870 PyErr_NoMemory();
14871 return NULL;
14872 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014873 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014874 return copy;
14875}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014876
Georg Brandl66c221e2010-10-14 07:04:07 +000014877/* A _string module, to export formatter_parser and formatter_field_name_split
14878 to the string.Formatter class implemented in Python. */
14879
14880static PyMethodDef _string_methods[] = {
14881 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14882 METH_O, PyDoc_STR("split the argument as a field name")},
14883 {"formatter_parser", (PyCFunction) formatter_parser,
14884 METH_O, PyDoc_STR("parse the argument as a format string")},
14885 {NULL, NULL}
14886};
14887
14888static struct PyModuleDef _string_module = {
14889 PyModuleDef_HEAD_INIT,
14890 "_string",
14891 PyDoc_STR("string helper module"),
14892 0,
14893 _string_methods,
14894 NULL,
14895 NULL,
14896 NULL,
14897 NULL
14898};
14899
14900PyMODINIT_FUNC
14901PyInit__string(void)
14902{
14903 return PyModule_Create(&_string_module);
14904}
14905
14906
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014907#ifdef __cplusplus
14908}
14909#endif