blob: 2c308bcc9b2b74057754530ec4a486dd2290dc13 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
/* Type check used by the assertions in this file.  Release builds only
   verify the type; Py_DEBUG builds run the structural consistency check
   (without scanning the string content). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* --- Raw field accessors ------------------------------------------------
   These expand to the struct member itself and perform no checking; several
   of them are used as assignment targets elsewhere in this file. */

/* Fields stored in the PyASCIIObject header. */
#define _PyUnicode_WSTR(obj)        (((PyASCIIObject *)(obj))->wstr)
#define _PyUnicode_LENGTH(obj)      (((PyASCIIObject *)(obj))->length)
#define _PyUnicode_STATE(obj)       (((PyASCIIObject *)(obj))->state)
#define _PyUnicode_HASH(obj)        (((PyASCIIObject *)(obj))->hash)

/* Fields stored in the PyCompactUnicodeObject extension. */
#define _PyUnicode_UTF8(obj)        (((PyCompactUnicodeObject *)(obj))->utf8)
#define _PyUnicode_UTF8_LENGTH(obj) \
    (((PyCompactUnicodeObject *)(obj))->utf8_length)
#define _PyUnicode_WSTR_LENGTH(obj) \
    (((PyCompactUnicodeObject *)(obj))->wstr_length)

/* Data pointer stored in the PyUnicodeObject extension. */
#define _PyUnicode_DATA_ANY(obj)    (((PyUnicodeObject *)(obj))->data.any)

/* --- Checked accessors --------------------------------------------------
   Same reads as above, guarded by assertions. */

/* UTF-8 pointer: for compact ASCII objects this is the memory placed
   directly after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(obj)                                     \
    (assert(_PyUnicode_CHECK(obj)),                             \
     assert(PyUnicode_IS_READY(obj)),                           \
     PyUnicode_IS_COMPACT_ASCII(obj)                            \
         ? ((char *)((PyASCIIObject *)(obj) + 1))               \
         : _PyUnicode_UTF8(obj))

/* UTF-8 length: for compact ASCII objects this is the character length,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(obj)                              \
    (assert(_PyUnicode_CHECK(obj)),                             \
     assert(PyUnicode_IS_READY(obj)),                           \
     PyUnicode_IS_COMPACT_ASCII(obj)                            \
         ? ((PyASCIIObject *)(obj))->length                     \
         : _PyUnicode_UTF8_LENGTH(obj))

#define _PyUnicode_KIND(obj)                                    \
    (assert(_PyUnicode_CHECK(obj)),                             \
     ((PyASCIIObject *)(obj))->state.kind)

#define _PyUnicode_GET_LENGTH(obj)                              \
    (assert(_PyUnicode_CHECK(obj)),                             \
     ((PyASCIIObject *)(obj))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Cheap stand-in for Py_MAX() when combining two maximum characters: the
   result is the bitwise OR of both arguments, which is never smaller than
   either of them.  Use it when you are computing the second argument of
   PyUnicode_New(). */
#define MAX_MAXCHAR(a, b) ((a) | (b))
118
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready().
   The argument is checked with _PyUnicode_CHECK() in an assertion. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the UTF-8 pointer is the canonical data pointer itself (the two
   representations share one buffer).  Must not be used on compact ASCII
   objects; this is asserted. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the canonical data pointer itself.
   The macro only refers to its own parameter, so it can be applied to any
   expression, not just to a variable that happens to be named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e. the
   object is not compact ASCII, the utf8 pointer is set, and it does not
   alias the canonical data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && !(_PyUnicode_UTF8(op) == PyUnicode_DATA(op))))

/* True if the Unicode object owns a wstr memory block of its own, i.e. the
   wstr pointer is set and the object is either not ready yet or its wstr
   does not alias the canonical data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL                               \
      && !(PyUnicode_IS_READY(op)                               \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the half-open range [begin, end) is converted.  to points to the buffer
   the converted characters are written to and is cast to "to_type *" as a
   whole expression, so pointer arithmetic in the argument is safe.

   The bulk of the input is copied four characters per iteration; the
   remaining 0-3 characters are handled by the trailing loop.  Each element
   is converted with a plain cast to to_type.  All temporaries carry a
   leading underscore to stay clear of the caller's identifiers. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
   Put another way, the actual reference count of a string is:
   s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by code point for the range 0x00-0x7F, holding 1 for
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
/* Same for linebreaks: a lookup table indexed by code point for the range
   0x00-0x7F, holding 1 for line break characters and 0 otherwise.  The
   table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), LINE TABULATION (0x0B),
       FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
Victor Stinner910337b2011-10-03 03:20:16 +0200299#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200300int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100301_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200302{
303 PyASCIIObject *ascii;
304 unsigned int kind;
305
306 assert(PyUnicode_Check(op));
307
308 ascii = (PyASCIIObject *)op;
309 kind = ascii->state.kind;
310
Victor Stinnera3b334d2011-10-03 13:53:37 +0200311 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner910337b2011-10-03 03:20:16 +0200312 assert(kind == PyUnicode_1BYTE_KIND);
Victor Stinner910337b2011-10-03 03:20:16 +0200313 assert(ascii->state.ready == 1);
314 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200315 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200316 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200317 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200318
Victor Stinnera41463c2011-10-04 01:05:08 +0200319 if (ascii->state.compact == 1) {
320 data = compact + 1;
Victor Stinner910337b2011-10-03 03:20:16 +0200321 assert(kind == PyUnicode_1BYTE_KIND
322 || kind == PyUnicode_2BYTE_KIND
323 || kind == PyUnicode_4BYTE_KIND);
Victor Stinnera41463c2011-10-04 01:05:08 +0200324 assert(ascii->state.ascii == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200325 assert(ascii->state.ready == 1);
Victor Stinnera41463c2011-10-04 01:05:08 +0200326 assert (compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100327 }
328 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200329 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
330
331 data = unicode->data.any;
332 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnere30c0a12011-11-04 20:54:05 +0100333 assert(ascii->length == 0);
334 assert(ascii->hash == -1);
Victor Stinnera41463c2011-10-04 01:05:08 +0200335 assert(ascii->state.compact == 0);
336 assert(ascii->state.ascii == 0);
337 assert(ascii->state.ready == 0);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100338 assert(ascii->state.interned == SSTATE_NOT_INTERNED);
Victor Stinnera41463c2011-10-04 01:05:08 +0200339 assert(ascii->wstr != NULL);
340 assert(data == NULL);
341 assert(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200342 }
343 else {
344 assert(kind == PyUnicode_1BYTE_KIND
345 || kind == PyUnicode_2BYTE_KIND
346 || kind == PyUnicode_4BYTE_KIND);
347 assert(ascii->state.compact == 0);
348 assert(ascii->state.ready == 1);
349 assert(data != NULL);
350 if (ascii->state.ascii) {
351 assert (compact->utf8 == data);
352 assert (compact->utf8_length == ascii->length);
353 }
354 else
355 assert (compact->utf8 != data);
356 }
357 }
358 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200359 if (
360#if SIZEOF_WCHAR_T == 2
361 kind == PyUnicode_2BYTE_KIND
362#else
363 kind == PyUnicode_4BYTE_KIND
364#endif
365 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200366 {
367 assert(ascii->wstr == data);
368 assert(compact->wstr_length == ascii->length);
369 } else
370 assert(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200371 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200372
373 if (compact->utf8 == NULL)
374 assert(compact->utf8_length == 0);
375 if (ascii->wstr == NULL)
376 assert(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200377 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200378 /* check that the best kind is used */
379 if (check_content && kind != PyUnicode_WCHAR_KIND)
380 {
381 Py_ssize_t i;
382 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200383 void *data;
384 Py_UCS4 ch;
385
386 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200387 for (i=0; i < ascii->length; i++)
388 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200389 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200390 if (ch > maxchar)
391 maxchar = ch;
392 }
393 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100394 if (ascii->state.ascii == 0) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200395 assert(maxchar >= 128);
Victor Stinner77faf692011-11-20 18:56:05 +0100396 assert(maxchar <= 255);
397 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200398 else
399 assert(maxchar < 128);
400 }
Victor Stinner77faf692011-11-20 18:56:05 +0100401 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200402 assert(maxchar >= 0x100);
Victor Stinner77faf692011-11-20 18:56:05 +0100403 assert(maxchar <= 0xFFFF);
404 }
405 else {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200406 assert(maxchar >= 0x10000);
Victor Stinner8faf8212011-12-08 22:14:11 +0100407 assert(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100408 }
Victor Stinner718fbf02012-04-26 00:39:37 +0200409 assert(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200410 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400411 return 1;
412}
Victor Stinner910337b2011-10-03 03:20:16 +0200413#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
Victor Stinner3a50e702011-10-18 21:21:00 +0200513#ifdef HAVE_MBCS
514static OSVERSIONINFOEX winver;
515#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
522
523/* the linebreak mask is set up by Unicode_Init below */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535#define BLOOM_MASK unsigned long
536
537static BLOOM_MASK bloom_linebreak;
538
Antoine Pitrouf068f942010-01-13 14:19:12 +0000539#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
540#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542#define BLOOM_LINEBREAK(ch) \
543 ((ch) < 128U ? ascii_linebreak[(ch)] : \
544 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
/* True if chr occurs in str.  The bloom mask is tested first so that
   PyUnicode_FindChar() only runs when the character may be present. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && !(PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100658 assert(PyUnicode_IS_COMPACT(unicode));
659
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200660 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100661 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 struct_size = sizeof(PyASCIIObject);
663 else
664 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 new_size = (struct_size + (length + 1) * char_size);
672
Victor Stinner84def372011-12-11 20:04:56 +0100673 _Py_DEC_REFTOTAL;
674 _Py_ForgetReference(unicode);
675
676 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
677 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100678 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200679 PyErr_NoMemory();
680 return NULL;
681 }
Victor Stinner84def372011-12-11 20:04:56 +0100682 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200683 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200686 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100688 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200691 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
692 length, 0);
693 return unicode;
694}
695
Alexander Belopolsky40018472011-02-26 01:02:56 +0000696static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200697resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698{
Victor Stinner95663112011-10-04 01:03:50 +0200699 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100700 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000703
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 if (PyUnicode_IS_READY(unicode)) {
705 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200706 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 void *data;
708
709 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200710 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200711 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
712 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713
714 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
715 PyErr_NoMemory();
716 return -1;
717 }
718 new_size = (length + 1) * char_size;
719
Victor Stinner7a9105a2011-12-12 00:13:42 +0100720 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
721 {
722 PyObject_DEL(_PyUnicode_UTF8(unicode));
723 _PyUnicode_UTF8(unicode) = NULL;
724 _PyUnicode_UTF8_LENGTH(unicode) = 0;
725 }
726
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727 data = (PyObject *)PyObject_REALLOC(data, new_size);
728 if (data == NULL) {
729 PyErr_NoMemory();
730 return -1;
731 }
732 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200733 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_WSTR_LENGTH(unicode) = length;
736 }
737 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200738 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200739 _PyUnicode_UTF8_LENGTH(unicode) = length;
740 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_LENGTH(unicode) = length;
742 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200743 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200744 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200745 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 }
Victor Stinner95663112011-10-04 01:03:50 +0200748 assert(_PyUnicode_WSTR(unicode) != NULL);
749
750 /* check for integer overflow */
751 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
752 PyErr_NoMemory();
753 return -1;
754 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100755 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200756 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200758 if (!wstr) {
759 PyErr_NoMemory();
760 return -1;
761 }
762 _PyUnicode_WSTR(unicode) = wstr;
763 _PyUnicode_WSTR(unicode)[length] = 0;
764 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200765 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000766 return 0;
767}
768
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769static PyObject*
770resize_copy(PyObject *unicode, Py_ssize_t length)
771{
772 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775
Benjamin Petersonbac79492012-01-14 13:34:47 -0500776 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
780 if (copy == NULL)
781 return NULL;
782
783 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200784 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200785 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200786 }
787 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100789
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200791 if (w == NULL)
792 return NULL;
793 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
794 copy_length = Py_MIN(copy_length, length);
795 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
796 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200797 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798 }
799}
800
/* We allocate room for one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200811static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812#endif
813
Alexander Belopolsky40018472011-02-26 01:02:56 +0000814static PyUnicodeObject *
815_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816{
817 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819
Thomas Wouters477c8d52006-05-27 19:21:47 +0000820 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 if (length == 0 && unicode_empty != NULL) {
822 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200823 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000826 /* Ensure we won't overflow the size. */
827 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
828 return (PyUnicodeObject *)PyErr_NoMemory();
829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 if (length < 0) {
831 PyErr_SetString(PyExc_SystemError,
832 "Negative size passed to _PyUnicode_New");
833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 }
835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200836#ifdef Py_DEBUG
837 ++unicode_old_new_calls;
838#endif
839
840 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
841 if (unicode == NULL)
842 return NULL;
843 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
844 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
845 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100846 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000847 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850
Jeremy Hyltond8082792003-09-16 19:41:39 +0000851 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000852 * the caller fails before initializing str -- unicode_resize()
853 * reads str[0], and the Keep-Alive optimization can keep memory
854 * allocated for str alive across a call to unicode_dealloc(unicode).
855 * We don't want unicode_resize to read uninitialized memory in
856 * that case.
857 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_WSTR(unicode)[0] = 0;
859 _PyUnicode_WSTR(unicode)[length] = 0;
860 _PyUnicode_WSTR_LENGTH(unicode) = length;
861 _PyUnicode_HASH(unicode) = -1;
862 _PyUnicode_STATE(unicode).interned = 0;
863 _PyUnicode_STATE(unicode).kind = 0;
864 _PyUnicode_STATE(unicode).compact = 0;
865 _PyUnicode_STATE(unicode).ready = 0;
866 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200867 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200869 _PyUnicode_UTF8(unicode) = NULL;
870 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100871 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872 return unicode;
873}
874
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875static const char*
876unicode_kind_name(PyObject *unicode)
877{
Victor Stinner42dfd712011-10-03 14:41:45 +0200878 /* don't check consistency: unicode_kind_name() is called from
879 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 if (!PyUnicode_IS_COMPACT(unicode))
881 {
882 if (!PyUnicode_IS_READY(unicode))
883 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600884 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 {
886 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200887 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200888 return "legacy ascii";
889 else
890 return "legacy latin1";
891 case PyUnicode_2BYTE_KIND:
892 return "legacy UCS2";
893 case PyUnicode_4BYTE_KIND:
894 return "legacy UCS4";
895 default:
896 return "<legacy invalid kind>";
897 }
898 }
899 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600900 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 return "ascii";
904 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 default:
911 return "<invalid compact kind>";
912 }
913}
914
#ifdef Py_DEBUG
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    /* address just past each of the two possible object headers */
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a string object to stdout: kind name,
   length, the wstr and utf8 pointers (flagged "shared" when they alias the
   data buffer) and the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    /* Three views of the same object, from the smallest header to the
       largest. */
    PyASCIIObject *as_ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *as_compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *as_legacy = (PyUnicodeObject *)op;
    void *data;

    if (!as_ascii->state.compact)
        data = as_legacy->data.any;
    else if (as_ascii->state.ascii)
        data = (as_ascii + 1);
    else
        data = (as_compact + 1);

    printf("%s: len=%zu, ",unicode_kind_name(op), as_ascii->length);

    if (as_ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", as_ascii->wstr);

    /* The wstr_length/utf8 fields are printed for everything except
       compact ASCII objects. */
    if (as_ascii->state.ascii != 1 || as_ascii->state.compact != 1) {
        printf(" (%zu), ", as_compact->wstr_length);
        if (!as_ascii->state.compact
            && as_compact->utf8 == as_legacy->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", as_compact->utf8, as_compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
968
969PyObject *
970PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
971{
972 PyObject *obj;
973 PyCompactUnicodeObject *unicode;
974 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200975 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200976 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 Py_ssize_t char_size;
978 Py_ssize_t struct_size;
979
980 /* Optimization for empty strings */
981 if (size == 0 && unicode_empty != NULL) {
982 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200983 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 }
985
986#ifdef Py_DEBUG
987 ++unicode_new_new_calls;
988#endif
989
Victor Stinner9e9d6892011-10-04 01:02:02 +0200990 is_ascii = 0;
991 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992 struct_size = sizeof(PyCompactUnicodeObject);
993 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200994 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 char_size = 1;
996 is_ascii = 1;
997 struct_size = sizeof(PyASCIIObject);
998 }
999 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 }
1003 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001004 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 char_size = 2;
1006 if (sizeof(wchar_t) == 2)
1007 is_sharing = 1;
1008 }
1009 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001010 if (maxchar > MAX_UNICODE) {
1011 PyErr_SetString(PyExc_SystemError,
1012 "invalid maximum character passed to PyUnicode_New");
1013 return NULL;
1014 }
Victor Stinner8f825062012-04-27 13:55:39 +02001015 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 char_size = 4;
1017 if (sizeof(wchar_t) == 4)
1018 is_sharing = 1;
1019 }
1020
1021 /* Ensure we won't overflow the size. */
1022 if (size < 0) {
1023 PyErr_SetString(PyExc_SystemError,
1024 "Negative size passed to PyUnicode_New");
1025 return NULL;
1026 }
1027 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1028 return PyErr_NoMemory();
1029
1030 /* Duplicated allocation code from _PyObject_New() instead of a call to
1031 * PyObject_New() so we are able to allocate space for the object and
1032 * it's data buffer.
1033 */
1034 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1035 if (obj == NULL)
1036 return PyErr_NoMemory();
1037 obj = PyObject_INIT(obj, &PyUnicode_Type);
1038 if (obj == NULL)
1039 return NULL;
1040
1041 unicode = (PyCompactUnicodeObject *)obj;
1042 if (is_ascii)
1043 data = ((PyASCIIObject*)obj) + 1;
1044 else
1045 data = unicode + 1;
1046 _PyUnicode_LENGTH(unicode) = size;
1047 _PyUnicode_HASH(unicode) = -1;
1048 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 _PyUnicode_STATE(unicode).compact = 1;
1051 _PyUnicode_STATE(unicode).ready = 1;
1052 _PyUnicode_STATE(unicode).ascii = is_ascii;
1053 if (is_ascii) {
1054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 }
Victor Stinner8f825062012-04-27 13:55:39 +02001057 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 ((char*)data)[size] = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 else {
1065 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001066 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001067 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS4*)data)[size] = 0;
1071 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 }
1075 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 }
1079 }
Victor Stinner8f825062012-04-27 13:55:39 +02001080#ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier.
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 return obj;
1089}
1090
#if SIZEOF_WCHAR_T == 2
/* Helper that expands a 16-bit wchar_t sequence [begin, end) into the UCS4
   buffer of `unicode`.  A high surrogate immediately followed by a low
   surrogate is joined into one code point; every other unit is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* two wchar_t units produce one code point */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1130
Victor Stinnercd9950f2011-10-02 00:34:53 +02001131static int
Victor Stinner488fa492011-12-12 00:01:39 +01001132unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133{
Victor Stinner488fa492011-12-12 00:01:39 +01001134 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001135 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001136 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001137 return -1;
1138 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return 0;
1140}
1141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142static int
1143_copy_characters(PyObject *to, Py_ssize_t to_start,
1144 PyObject *from, Py_ssize_t from_start,
1145 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 unsigned int from_kind, to_kind;
1148 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001149 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_Check(from));
1152 assert(PyUnicode_Check(to));
1153 assert(PyUnicode_IS_READY(from));
1154 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1157 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1158 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001160 if (how_many == 0)
1161 return 0;
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001168#ifdef Py_DEBUG
1169 if (!check_maxchar
1170 && (from_kind > to_kind
1171 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001173 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1174 Py_UCS4 ch;
1175 Py_ssize_t i;
1176 for (i=0; i < how_many; i++) {
1177 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1178 assert(ch <= to_maxchar);
1179 }
1180 }
1181#endif
1182 fast = (from_kind == to_kind);
1183 if (check_maxchar
1184 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1185 {
1186 /* deny latin1 => ascii */
1187 fast = 0;
1188 }
1189
1190 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001191 Py_MEMCPY((char*)to_data + to_kind * to_start,
1192 (char*)from_data + from_kind * from_start,
1193 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 else if (from_kind == PyUnicode_1BYTE_KIND
1196 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001197 {
1198 _PyUnicode_CONVERT_BYTES(
1199 Py_UCS1, Py_UCS2,
1200 PyUnicode_1BYTE_DATA(from) + from_start,
1201 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1202 PyUnicode_2BYTE_DATA(to) + to_start
1203 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001205 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 && to_kind == PyUnicode_4BYTE_KIND)
1207 {
1208 _PyUnicode_CONVERT_BYTES(
1209 Py_UCS1, Py_UCS4,
1210 PyUnicode_1BYTE_DATA(from) + from_start,
1211 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1212 PyUnicode_4BYTE_DATA(to) + to_start
1213 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001214 }
1215 else if (from_kind == PyUnicode_2BYTE_KIND
1216 && to_kind == PyUnicode_4BYTE_KIND)
1217 {
1218 _PyUnicode_CONVERT_BYTES(
1219 Py_UCS2, Py_UCS4,
1220 PyUnicode_2BYTE_DATA(from) + from_start,
1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222 PyUnicode_4BYTE_DATA(to) + to_start
1223 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001224 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001226 /* check if max_char(from substring) <= max_char(to) */
1227 if (from_kind > to_kind
1228 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001229 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001230 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 /* slow path to check for character overflow */
1232 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 Py_ssize_t i;
1235
Victor Stinner56c161a2011-10-06 02:47:11 +02001236#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 for (i=0; i < how_many; i++) {
1238 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001240 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1241 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001242#else
1243 if (!check_maxchar) {
1244 for (i=0; i < how_many; i++) {
1245 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1246 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1247 }
1248 }
1249 else {
1250 for (i=0; i < how_many; i++) {
1251 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1252 if (ch > to_maxchar)
1253 return 1;
1254 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1255 }
1256 }
1257#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001258 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001259 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001260 assert(0 && "inconsistent state");
1261 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001262 }
1263 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001264 return 0;
1265}
1266
1267static void
1268copy_characters(PyObject *to, Py_ssize_t to_start,
1269 PyObject *from, Py_ssize_t from_start,
1270 Py_ssize_t how_many)
1271{
1272 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1273}
1274
1275Py_ssize_t
1276PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1277 PyObject *from, Py_ssize_t from_start,
1278 Py_ssize_t how_many)
1279{
1280 int err;
1281
1282 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1283 PyErr_BadInternalCall();
1284 return -1;
1285 }
1286
Benjamin Petersonbac79492012-01-14 13:34:47 -05001287 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001289 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290 return -1;
1291
1292 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1293 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1294 PyErr_Format(PyExc_SystemError,
1295 "Cannot write %zi characters at %zi "
1296 "in a string of %zi characters",
1297 how_many, to_start, PyUnicode_GET_LENGTH(to));
1298 return -1;
1299 }
1300
1301 if (how_many == 0)
1302 return 0;
1303
Victor Stinner488fa492011-12-12 00:01:39 +01001304 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001305 return -1;
1306
1307 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1308 if (err) {
1309 PyErr_Format(PyExc_SystemError,
1310 "Cannot copy %s characters "
1311 "into a string of %s characters",
1312 unicode_kind_name(from),
1313 unicode_kind_name(to));
1314 return -1;
1315 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001316 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317}
1318
Victor Stinner17222162011-09-28 22:15:37 +02001319/* Find the maximum code point and count the number of surrogate pairs so a
1320 correct string length can be computed before converting a string to UCS4.
1321 This function counts single surrogates as a character and not as a pair.
1322
1323 Return 0 on success, or -1 on error. */
1324static int
1325find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1326 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327{
1328 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001329 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330
Victor Stinnerc53be962011-10-02 21:33:54 +02001331 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 *num_surrogates = 0;
1333 *maxchar = 0;
1334
1335 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001337 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1338 && (iter+1) < end
1339 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001341 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 iter += 2;
1344 }
1345 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001347 {
1348 ch = *iter;
1349 iter++;
1350 }
1351 if (ch > *maxchar) {
1352 *maxchar = ch;
1353 if (*maxchar > MAX_UNICODE) {
1354 PyErr_Format(PyExc_ValueError,
1355 "character U+%x is not in range [U+0000; U+10ffff]",
1356 ch);
1357 return -1;
1358 }
1359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 }
1361 return 0;
1362}
1363
1364#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001365static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366#endif
1367
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001368int
1369_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370{
1371 wchar_t *end;
1372 Py_UCS4 maxchar = 0;
1373 Py_ssize_t num_surrogates;
1374#if SIZEOF_WCHAR_T == 2
1375 Py_ssize_t length_wo_surrogates;
1376#endif
1377
Georg Brandl7597add2011-10-05 16:36:47 +02001378 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001379 strings were created using _PyObject_New() and where no canonical
1380 representation (the str field) has been set yet aka strings
1381 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001382 assert(_PyUnicode_CHECK(unicode));
1383 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001385 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001386 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001387 /* Actually, it should neither be interned nor be anything else: */
1388 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389
1390#ifdef Py_DEBUG
1391 ++unicode_ready_calls;
1392#endif
1393
1394 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001395 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001396 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
1399 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001400 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1401 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 PyErr_NoMemory();
1403 return -1;
1404 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001405 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 _PyUnicode_WSTR(unicode), end,
1407 PyUnicode_1BYTE_DATA(unicode));
1408 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1411 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001412 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001413 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001414 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
1416 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001417 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001418 _PyUnicode_UTF8(unicode) = NULL;
1419 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
1421 PyObject_FREE(_PyUnicode_WSTR(unicode));
1422 _PyUnicode_WSTR(unicode) = NULL;
1423 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1424 }
1425 /* In this case we might have to convert down from 4-byte native
1426 wchar_t to 2-byte unicode. */
1427 else if (maxchar < 65536) {
1428 assert(num_surrogates == 0 &&
1429 "FindMaxCharAndNumSurrogatePairs() messed up");
1430
Victor Stinner506f5922011-09-28 22:34:18 +02001431#if SIZEOF_WCHAR_T == 2
1432 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001433 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439#else
1440 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001441 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001442 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001443 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001444 PyErr_NoMemory();
1445 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 }
Victor Stinner506f5922011-09-28 22:34:18 +02001447 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1448 _PyUnicode_WSTR(unicode), end,
1449 PyUnicode_2BYTE_DATA(unicode));
1450 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1451 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001455 PyObject_FREE(_PyUnicode_WSTR(unicode));
1456 _PyUnicode_WSTR(unicode) = NULL;
1457 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1458#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 }
1460 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1461 else {
1462#if SIZEOF_WCHAR_T == 2
1463 /* in case the native representation is 2-bytes, we need to allocate a
1464 new normalized 4-byte version. */
1465 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1467 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyErr_NoMemory();
1469 return -1;
1470 }
1471 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001473 _PyUnicode_UTF8(unicode) = NULL;
1474 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001475 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001477 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 PyObject_FREE(_PyUnicode_WSTR(unicode));
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1481#else
1482 assert(num_surrogates == 0);
1483
Victor Stinnerc3c74152011-10-02 20:39:55 +02001484 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001486 _PyUnicode_UTF8(unicode) = NULL;
1487 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1489#endif
1490 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1491 }
1492 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001493 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 return 0;
1495}
1496
Alexander Belopolsky40018472011-02-26 01:02:56 +00001497static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001498unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499{
Walter Dörwald16807132007-05-25 13:52:07 +00001500 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001501 case SSTATE_NOT_INTERNED:
1502 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001503
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 case SSTATE_INTERNED_MORTAL:
1505 /* revive dead object temporarily for DelItem */
1506 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001507 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001508 Py_FatalError(
1509 "deletion of interned string failed");
1510 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 case SSTATE_INTERNED_IMMORTAL:
1513 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001514
Benjamin Peterson29060642009-01-31 22:14:21 +00001515 default:
1516 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001517 }
1518
Victor Stinner03490912011-10-03 23:45:12 +02001519 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001521 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001522 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001523 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1524 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001526 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527}
1528
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the object stored in the unicode_latin1 cache for its single character,
   0 otherwise.  Strings still in the wchar_t (not ready) kind are never
   looked up in the cache. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty) {
        return 1;
    }
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }
    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546static int
Victor Stinner488fa492011-12-12 00:01:39 +01001547unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548{
Victor Stinner488fa492011-12-12 00:01:39 +01001549 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 if (Py_REFCNT(unicode) != 1)
1551 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001552 if (_PyUnicode_HASH(unicode) != -1)
1553 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554 if (PyUnicode_CHECK_INTERNED(unicode))
1555 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001556 if (!PyUnicode_CheckExact(unicode))
1557 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001558#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 /* singleton refcount is greater than 1 */
1560 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001561#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 return 1;
1563}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001564
Victor Stinnerfe226c02011-10-03 03:52:20 +02001565static int
1566unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1567{
1568 PyObject *unicode;
1569 Py_ssize_t old_length;
1570
1571 assert(p_unicode != NULL);
1572 unicode = *p_unicode;
1573
1574 assert(unicode != NULL);
1575 assert(PyUnicode_Check(unicode));
1576 assert(0 <= length);
1577
Victor Stinner910337b2011-10-03 03:20:16 +02001578 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 old_length = PyUnicode_WSTR_LENGTH(unicode);
1580 else
1581 old_length = PyUnicode_GET_LENGTH(unicode);
1582 if (old_length == length)
1583 return 0;
1584
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001585 if (length == 0) {
1586 Py_DECREF(*p_unicode);
1587 *p_unicode = unicode_empty;
1588 Py_INCREF(*p_unicode);
1589 return 0;
1590 }
1591
Victor Stinner488fa492011-12-12 00:01:39 +01001592 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001593 PyObject *copy = resize_copy(unicode, length);
1594 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 Py_DECREF(*p_unicode);
1597 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001599 }
1600
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001602 PyObject *new_unicode = resize_compact(unicode, length);
1603 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001604 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001605 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001606 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001608 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001609 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001610}
1611
Alexander Belopolsky40018472011-02-26 01:02:56 +00001612int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001614{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 PyObject *unicode;
1616 if (p_unicode == NULL) {
1617 PyErr_BadInternalCall();
1618 return -1;
1619 }
1620 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001621 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 {
1623 PyErr_BadInternalCall();
1624 return -1;
1625 }
1626 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001627}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001629static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001630unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1631 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001632{
1633 PyObject *result;
1634 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001635 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001636 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1637 return 0;
1638 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1639 maxchar);
1640 if (result == NULL)
1641 return -1;
Victor Stinner1b487b42012-05-03 12:29:04 +02001642 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001643 Py_DECREF(*p_unicode);
1644 *p_unicode = result;
1645 return 0;
1646}
1647
1648static int
1649unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1650 Py_UCS4 ch)
1651{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001652 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001653 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 return -1;
1655 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1656 PyUnicode_DATA(*p_unicode),
1657 (*pos)++, ch);
1658 return 0;
1659}
1660
Victor Stinnerc5166102012-02-22 13:55:02 +01001661/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1662 Return the length of the input string.
1663
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001664 WARNING: The function doesn't copy the terminating null character and
1665 doesn't check the maximum character (may write a latin1 character in an
1666 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001667static Py_ssize_t
1668unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1669{
1670 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1671 void *data = PyUnicode_DATA(unicode);
1672
1673 switch (kind) {
1674 case PyUnicode_1BYTE_KIND: {
1675 Py_ssize_t len = strlen(str);
1676 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001677 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001678 return len;
1679 }
1680 case PyUnicode_2BYTE_KIND: {
1681 Py_UCS2 *start = (Py_UCS2 *)data + index;
1682 Py_UCS2 *ucs2 = start;
1683 assert(index <= PyUnicode_GET_LENGTH(unicode));
1684
1685 for (; *str; ++ucs2, ++str)
1686 *ucs2 = (Py_UCS2)*str;
1687
1688 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1689 return ucs2 - start;
1690 }
1691 default: {
1692 Py_UCS4 *start = (Py_UCS4 *)data + index;
1693 Py_UCS4 *ucs4 = start;
1694 assert(kind == PyUnicode_4BYTE_KIND);
1695 assert(index <= PyUnicode_GET_LENGTH(unicode));
1696
1697 for (; *str; ++ucs4, ++str)
1698 *ucs4 = (Py_UCS4)*str;
1699
1700 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1701 return ucs4 - start;
1702 }
1703 }
1704}
1705
1706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001707static PyObject*
1708get_latin1_char(unsigned char ch)
1709{
Victor Stinnera464fc12011-10-02 20:39:30 +02001710 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001712 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!unicode)
1714 return NULL;
1715 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001716 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717 unicode_latin1[ch] = unicode;
1718 }
1719 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001720 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721}
1722
Alexander Belopolsky40018472011-02-26 01:02:56 +00001723PyObject *
1724PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001726 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 Py_UCS4 maxchar = 0;
1728 Py_ssize_t num_surrogates;
1729
1730 if (u == NULL)
1731 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001733 /* If the Unicode data is known at construction time, we can apply
1734 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 /* Optimization for empty strings */
1737 if (size == 0 && unicode_empty != NULL) {
1738 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001739 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001740 }
Tim Petersced69f82003-09-16 20:30:58 +00001741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 /* Single character Unicode objects in the Latin-1 range are
1743 shared when using this constructor */
1744 if (size == 1 && *u < 256)
1745 return get_latin1_char((unsigned char)*u);
1746
1747 /* If not empty and not single character, copy the Unicode data
1748 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001749 if (find_maxchar_surrogates(u, u + size,
1750 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 return NULL;
1752
Victor Stinner8faf8212011-12-08 22:14:11 +01001753 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 if (!unicode)
1755 return NULL;
1756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 switch (PyUnicode_KIND(unicode)) {
1758 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001759 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1761 break;
1762 case PyUnicode_2BYTE_KIND:
1763#if Py_UNICODE_SIZE == 2
1764 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1765#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001766 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1768#endif
1769 break;
1770 case PyUnicode_4BYTE_KIND:
1771#if SIZEOF_WCHAR_T == 2
1772 /* This is the only case which has to process surrogates, thus
1773 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001774 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775#else
1776 assert(num_surrogates == 0);
1777 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1778#endif
1779 break;
1780 default:
1781 assert(0 && "Impossible state");
1782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001784 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785}
1786
Alexander Belopolsky40018472011-02-26 01:02:56 +00001787PyObject *
1788PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001789{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 if (size < 0) {
1791 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001792 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001793 return NULL;
1794 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001795 if (u != NULL)
1796 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1797 else
1798 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001799}
1800
Alexander Belopolsky40018472011-02-26 01:02:56 +00001801PyObject *
1802PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001803{
1804 size_t size = strlen(u);
1805 if (size > PY_SSIZE_T_MAX) {
1806 PyErr_SetString(PyExc_OverflowError, "input too long");
1807 return NULL;
1808 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001809 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001810}
1811
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001812PyObject *
1813_PyUnicode_FromId(_Py_Identifier *id)
1814{
1815 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001816 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1817 strlen(id->string),
1818 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001819 if (!id->object)
1820 return NULL;
1821 PyUnicode_InternInPlace(&id->object);
1822 assert(!id->next);
1823 id->next = static_strings;
1824 static_strings = id;
1825 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001826 return id->object;
1827}
1828
1829void
1830_PyUnicode_ClearStaticStrings()
1831{
1832 _Py_Identifier *i;
1833 for (i = static_strings; i; i = i->next) {
1834 Py_DECREF(i->object);
1835 i->object = NULL;
1836 i->next = NULL;
1837 }
1838}
1839
Benjamin Peterson0df54292012-03-26 14:50:32 -04001840/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001841
Victor Stinnere57b1c02011-09-28 22:20:48 +02001842static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001843unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001844{
Victor Stinner785938e2011-12-11 20:09:03 +01001845 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001846 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001847#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001848 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001849#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001850 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001851 }
Victor Stinner785938e2011-12-11 20:09:03 +01001852 unicode = PyUnicode_New(size, 127);
1853 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001854 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001855 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1856 assert(_PyUnicode_CheckConsistency(unicode, 1));
1857 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001858}
1859
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001860static Py_UCS4
1861kind_maxchar_limit(unsigned int kind)
1862{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001863 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001864 case PyUnicode_1BYTE_KIND:
1865 return 0x80;
1866 case PyUnicode_2BYTE_KIND:
1867 return 0x100;
1868 case PyUnicode_4BYTE_KIND:
1869 return 0x10000;
1870 default:
1871 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001872 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001873 }
1874}
1875
Victor Stinnere6abb482012-05-02 01:15:40 +02001876Py_LOCAL_INLINE(Py_UCS4)
1877align_maxchar(Py_UCS4 maxchar)
1878{
1879 if (maxchar <= 127)
1880 return 127;
1881 else if (maxchar <= 255)
1882 return 255;
1883 else if (maxchar <= 65535)
1884 return 65535;
1885 else
1886 return MAX_UNICODE;
1887}
1888
Victor Stinner702c7342011-10-05 13:50:52 +02001889static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001890_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001893 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001894
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001895 if (size == 0) {
1896 Py_INCREF(unicode_empty);
1897 return unicode_empty;
1898 }
1899 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001900 if (size == 1)
1901 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001902
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001903 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001904 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 if (!res)
1906 return NULL;
1907 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001908 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001910}
1911
Victor Stinnere57b1c02011-09-28 22:20:48 +02001912static PyObject*
1913_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914{
1915 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001917
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918 if (size == 0) {
1919 Py_INCREF(unicode_empty);
1920 return unicode_empty;
1921 }
1922 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001923 if (size == 1) {
1924 Py_UCS4 ch = u[0];
1925 if (ch < 256)
1926 return get_latin1_char((unsigned char)ch);
1927
1928 res = PyUnicode_New(1, ch);
1929 if (res == NULL)
1930 return NULL;
1931 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1932 assert(_PyUnicode_CheckConsistency(res, 1));
1933 return res;
1934 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001935
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001937 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 if (!res)
1939 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001940 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 else {
1943 _PyUnicode_CONVERT_BYTES(
1944 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1945 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001946 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 return res;
1948}
1949
Victor Stinnere57b1c02011-09-28 22:20:48 +02001950static PyObject*
1951_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952{
1953 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001954 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001955
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001956 if (size == 0) {
1957 Py_INCREF(unicode_empty);
1958 return unicode_empty;
1959 }
1960 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001961 if (size == 1) {
1962 Py_UCS4 ch = u[0];
1963 if (ch < 256)
1964 return get_latin1_char((unsigned char)ch);
1965
1966 res = PyUnicode_New(1, ch);
1967 if (res == NULL)
1968 return NULL;
1969 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1970 assert(_PyUnicode_CheckConsistency(res, 1));
1971 return res;
1972 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001973
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001974 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001975 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 if (!res)
1977 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001978 if (max_char < 256)
1979 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1980 PyUnicode_1BYTE_DATA(res));
1981 else if (max_char < 0x10000)
1982 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1983 PyUnicode_2BYTE_DATA(res));
1984 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001986 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 return res;
1988}
1989
1990PyObject*
1991PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1992{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001993 if (size < 0) {
1994 PyErr_SetString(PyExc_ValueError, "size must be positive");
1995 return NULL;
1996 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001997 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001999 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002001 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002003 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002005 PyErr_SetString(PyExc_SystemError, "invalid kind");
2006 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008}
2009
Victor Stinnerece58de2012-04-23 23:36:38 +02002010Py_UCS4
2011_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2012{
2013 enum PyUnicode_Kind kind;
2014 void *startptr, *endptr;
2015
2016 assert(PyUnicode_IS_READY(unicode));
2017 assert(0 <= start);
2018 assert(end <= PyUnicode_GET_LENGTH(unicode));
2019 assert(start <= end);
2020
2021 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2022 return PyUnicode_MAX_CHAR_VALUE(unicode);
2023
2024 if (start == end)
2025 return 127;
2026
Victor Stinner94d558b2012-04-27 22:26:58 +02002027 if (PyUnicode_IS_ASCII(unicode))
2028 return 127;
2029
Victor Stinnerece58de2012-04-23 23:36:38 +02002030 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002031 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002032 endptr = (char *)startptr + end * kind;
2033 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002034 switch(kind) {
2035 case PyUnicode_1BYTE_KIND:
2036 return ucs1lib_find_max_char(startptr, endptr);
2037 case PyUnicode_2BYTE_KIND:
2038 return ucs2lib_find_max_char(startptr, endptr);
2039 case PyUnicode_4BYTE_KIND:
2040 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002041 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002042 assert(0);
2043 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002044 }
2045}
2046
Victor Stinner25a4b292011-10-06 12:31:55 +02002047/* Ensure that a string uses the most efficient storage, if it is not the
2048 case: create a new string with of the right kind. Write NULL into *p_unicode
2049 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002050static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002051unicode_adjust_maxchar(PyObject **p_unicode)
2052{
2053 PyObject *unicode, *copy;
2054 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002055 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002056 unsigned int kind;
2057
2058 assert(p_unicode != NULL);
2059 unicode = *p_unicode;
2060 assert(PyUnicode_IS_READY(unicode));
2061 if (PyUnicode_IS_ASCII(unicode))
2062 return;
2063
2064 len = PyUnicode_GET_LENGTH(unicode);
2065 kind = PyUnicode_KIND(unicode);
2066 if (kind == PyUnicode_1BYTE_KIND) {
2067 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002068 max_char = ucs1lib_find_max_char(u, u + len);
2069 if (max_char >= 128)
2070 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002071 }
2072 else if (kind == PyUnicode_2BYTE_KIND) {
2073 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002074 max_char = ucs2lib_find_max_char(u, u + len);
2075 if (max_char >= 256)
2076 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002077 }
2078 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002079 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002080 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002081 max_char = ucs4lib_find_max_char(u, u + len);
2082 if (max_char >= 0x10000)
2083 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002084 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 copy = PyUnicode_New(len, max_char);
2086 copy_characters(copy, 0, unicode, 0, len);
2087 Py_DECREF(unicode);
2088 *p_unicode = copy;
2089}
2090
Victor Stinner034f6cf2011-09-30 02:26:44 +02002091PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002092_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002093{
Victor Stinner87af4f22011-11-21 23:03:47 +01002094 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002095 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002096
Victor Stinner034f6cf2011-09-30 02:26:44 +02002097 if (!PyUnicode_Check(unicode)) {
2098 PyErr_BadInternalCall();
2099 return NULL;
2100 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002101 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002102 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002103
Victor Stinner87af4f22011-11-21 23:03:47 +01002104 length = PyUnicode_GET_LENGTH(unicode);
2105 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002106 if (!copy)
2107 return NULL;
2108 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2109
Victor Stinner87af4f22011-11-21 23:03:47 +01002110 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2111 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002112 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002113 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002114}
2115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116
Victor Stinnerbc603d12011-10-02 01:00:40 +02002117/* Widen Unicode objects to larger buffers. Don't write terminating null
2118 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119
2120void*
2121_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2122{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002123 Py_ssize_t len;
2124 void *result;
2125 unsigned int skind;
2126
Benjamin Petersonbac79492012-01-14 13:34:47 -05002127 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002128 return NULL;
2129
2130 len = PyUnicode_GET_LENGTH(s);
2131 skind = PyUnicode_KIND(s);
2132 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002133 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 return NULL;
2135 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002136 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002137 case PyUnicode_2BYTE_KIND:
2138 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2139 if (!result)
2140 return PyErr_NoMemory();
2141 assert(skind == PyUnicode_1BYTE_KIND);
2142 _PyUnicode_CONVERT_BYTES(
2143 Py_UCS1, Py_UCS2,
2144 PyUnicode_1BYTE_DATA(s),
2145 PyUnicode_1BYTE_DATA(s) + len,
2146 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002148 case PyUnicode_4BYTE_KIND:
2149 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2150 if (!result)
2151 return PyErr_NoMemory();
2152 if (skind == PyUnicode_2BYTE_KIND) {
2153 _PyUnicode_CONVERT_BYTES(
2154 Py_UCS2, Py_UCS4,
2155 PyUnicode_2BYTE_DATA(s),
2156 PyUnicode_2BYTE_DATA(s) + len,
2157 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002159 else {
2160 assert(skind == PyUnicode_1BYTE_KIND);
2161 _PyUnicode_CONVERT_BYTES(
2162 Py_UCS1, Py_UCS4,
2163 PyUnicode_1BYTE_DATA(s),
2164 PyUnicode_1BYTE_DATA(s) + len,
2165 result);
2166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002168 default:
2169 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 }
Victor Stinner01698042011-10-04 00:04:26 +02002171 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return NULL;
2173}
2174
2175static Py_UCS4*
2176as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2177 int copy_null)
2178{
2179 int kind;
2180 void *data;
2181 Py_ssize_t len, targetlen;
2182 if (PyUnicode_READY(string) == -1)
2183 return NULL;
2184 kind = PyUnicode_KIND(string);
2185 data = PyUnicode_DATA(string);
2186 len = PyUnicode_GET_LENGTH(string);
2187 targetlen = len;
2188 if (copy_null)
2189 targetlen++;
2190 if (!target) {
2191 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2192 PyErr_NoMemory();
2193 return NULL;
2194 }
2195 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2196 if (!target) {
2197 PyErr_NoMemory();
2198 return NULL;
2199 }
2200 }
2201 else {
2202 if (targetsize < targetlen) {
2203 PyErr_Format(PyExc_SystemError,
2204 "string is longer than the buffer");
2205 if (copy_null && 0 < targetsize)
2206 target[0] = 0;
2207 return NULL;
2208 }
2209 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002210 if (kind == PyUnicode_1BYTE_KIND) {
2211 Py_UCS1 *start = (Py_UCS1 *) data;
2212 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002214 else if (kind == PyUnicode_2BYTE_KIND) {
2215 Py_UCS2 *start = (Py_UCS2 *) data;
2216 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2217 }
2218 else {
2219 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 if (copy_null)
2223 target[len] = 0;
2224 return target;
2225}
2226
2227Py_UCS4*
2228PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2229 int copy_null)
2230{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002231 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 PyErr_BadInternalCall();
2233 return NULL;
2234 }
2235 return as_ucs4(string, target, targetsize, copy_null);
2236}
2237
2238Py_UCS4*
2239PyUnicode_AsUCS4Copy(PyObject *string)
2240{
2241 return as_ucs4(string, NULL, 0, 1);
2242}
2243
2244#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002245
Alexander Belopolsky40018472011-02-26 01:02:56 +00002246PyObject *
2247PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002250 if (size == 0) {
2251 Py_INCREF(unicode_empty);
2252 return unicode_empty;
2253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002254 PyErr_BadInternalCall();
2255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 }
2257
Martin v. Löwis790465f2008-04-05 20:41:37 +00002258 if (size == -1) {
2259 size = wcslen(w);
2260 }
2261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263}
2264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002266
Walter Dörwald346737f2007-05-31 10:44:43 +00002267static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002268makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2269 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002270{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 *fmt++ = '%';
2272 if (width) {
2273 if (zeropad)
2274 *fmt++ = '0';
2275 fmt += sprintf(fmt, "%d", width);
2276 }
2277 if (precision)
2278 fmt += sprintf(fmt, ".%d", precision);
2279 if (longflag)
2280 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002281 else if (longlongflag) {
2282 /* longlongflag should only ever be nonzero on machines with
2283 HAVE_LONG_LONG defined */
2284#ifdef HAVE_LONG_LONG
2285 char *f = PY_FORMAT_LONG_LONG;
2286 while (*f)
2287 *fmt++ = *f++;
2288#else
2289 /* we shouldn't ever get here */
2290 assert(0);
2291 *fmt++ = 'l';
2292#endif
2293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 else if (size_tflag) {
2295 char *f = PY_FORMAT_SIZE_T;
2296 while (*f)
2297 *fmt++ = *f++;
2298 }
2299 *fmt++ = c;
2300 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002301}
2302
/* Helper for PyUnicode_FromFormatV(): parse one conversion specification.

   `f` must point at the '%' introducing the specification.  The optional
   width, ".precision" and length modifier ("l", "ll" when HAVE_LONG_LONG
   is defined, or "z"; each recognised only when followed by 'd', 'u' or
   'i') are parsed.  Each result is stored through the matching pointer
   unless that pointer is NULL.  Return a pointer to the character that
   follows the parsed part, normally the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0, field_precision = 0;
    int is_long = 0, is_longlong = 0, is_size_t = 0;
    char next;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;                        /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        for (++f; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t forms
       %zd, %zu, %zi.  A modifier not followed by d/u/i is left alone. */
    if (*f == 'l') {
        next = f[1];
        if (next == 'd' || next == 'u' || next == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (next == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z') {
        next = f[1];
        if (next == 'd' || next == 'u' || next == 'i') {
            is_size_t = 1;
            f += 1;
        }
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2367
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002368/* maximum number of characters required for output of %ld. 21 characters
2369 allows for 64-bit integers (in decimal) and an optional sign. */
2370#define MAX_LONG_CHARS 21
2371/* maximum number of characters required for output of %lld.
2372 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2373 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2374#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2375
Walter Dörwaldd2034312007-05-18 16:29:38 +00002376PyObject *
2377PyUnicode_FromFormatV(const char *format, va_list vargs)
2378{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 va_list count;
2380 Py_ssize_t callcount = 0;
2381 PyObject **callresults = NULL;
2382 PyObject **callresult = NULL;
2383 Py_ssize_t n = 0;
2384 int width = 0;
2385 int precision = 0;
2386 int zeropad;
2387 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002388 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002390 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2392 Py_UCS4 argmaxchar;
2393 Py_ssize_t numbersize = 0;
2394 char *numberresults = NULL;
2395 char *numberresult = NULL;
2396 Py_ssize_t i;
2397 int kind;
2398 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002399
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002400 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002401 /* step 1: count the number of %S/%R/%A/%s format specifications
2402 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2403 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002405 * also estimate a upper bound for all the number formats in the string,
2406 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 for (f = format; *f; f++) {
2409 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002410 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2412 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2413 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2414 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002417#ifdef HAVE_LONG_LONG
2418 if (longlongflag) {
2419 if (width < MAX_LONG_LONG_CHARS)
2420 width = MAX_LONG_LONG_CHARS;
2421 }
2422 else
2423#endif
2424 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2425 including sign. Decimal takes the most space. This
2426 isn't enough for octal. If a width is specified we
2427 need more (which we allocate later). */
2428 if (width < MAX_LONG_CHARS)
2429 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430
2431 /* account for the size + '\0' to separate numbers
2432 inside of the numberresults buffer */
2433 numbersize += (width + 1);
2434 }
2435 }
2436 else if ((unsigned char)*f > 127) {
2437 PyErr_Format(PyExc_ValueError,
2438 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2439 "string, got a non-ASCII byte: 0x%02x",
2440 (unsigned char)*f);
2441 return NULL;
2442 }
2443 }
2444 /* step 2: allocate memory for the results of
2445 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2446 if (callcount) {
2447 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2448 if (!callresults) {
2449 PyErr_NoMemory();
2450 return NULL;
2451 }
2452 callresult = callresults;
2453 }
2454 /* step 2.5: allocate memory for the results of formating numbers */
2455 if (numbersize) {
2456 numberresults = PyObject_Malloc(numbersize);
2457 if (!numberresults) {
2458 PyErr_NoMemory();
2459 goto fail;
2460 }
2461 numberresult = numberresults;
2462 }
2463
2464 /* step 3: format numbers and figure out how large a buffer we need */
2465 for (f = format; *f; f++) {
2466 if (*f == '%') {
2467 const char* p;
2468 int longflag;
2469 int longlongflag;
2470 int size_tflag;
2471 int numprinted;
2472
2473 p = f;
2474 zeropad = (f[1] == '0');
2475 f = parse_format_flags(f, &width, &precision,
2476 &longflag, &longlongflag, &size_tflag);
2477 switch (*f) {
2478 case 'c':
2479 {
2480 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002481 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482 n++;
2483 break;
2484 }
2485 case '%':
2486 n++;
2487 break;
2488 case 'i':
2489 case 'd':
2490 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2491 width, precision, *f);
2492 if (longflag)
2493 numprinted = sprintf(numberresult, fmt,
2494 va_arg(count, long));
2495#ifdef HAVE_LONG_LONG
2496 else if (longlongflag)
2497 numprinted = sprintf(numberresult, fmt,
2498 va_arg(count, PY_LONG_LONG));
2499#endif
2500 else if (size_tflag)
2501 numprinted = sprintf(numberresult, fmt,
2502 va_arg(count, Py_ssize_t));
2503 else
2504 numprinted = sprintf(numberresult, fmt,
2505 va_arg(count, int));
2506 n += numprinted;
2507 /* advance by +1 to skip over the '\0' */
2508 numberresult += (numprinted + 1);
2509 assert(*(numberresult - 1) == '\0');
2510 assert(*(numberresult - 2) != '\0');
2511 assert(numprinted >= 0);
2512 assert(numberresult <= numberresults + numbersize);
2513 break;
2514 case 'u':
2515 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2516 width, precision, 'u');
2517 if (longflag)
2518 numprinted = sprintf(numberresult, fmt,
2519 va_arg(count, unsigned long));
2520#ifdef HAVE_LONG_LONG
2521 else if (longlongflag)
2522 numprinted = sprintf(numberresult, fmt,
2523 va_arg(count, unsigned PY_LONG_LONG));
2524#endif
2525 else if (size_tflag)
2526 numprinted = sprintf(numberresult, fmt,
2527 va_arg(count, size_t));
2528 else
2529 numprinted = sprintf(numberresult, fmt,
2530 va_arg(count, unsigned int));
2531 n += numprinted;
2532 numberresult += (numprinted + 1);
2533 assert(*(numberresult - 1) == '\0');
2534 assert(*(numberresult - 2) != '\0');
2535 assert(numprinted >= 0);
2536 assert(numberresult <= numberresults + numbersize);
2537 break;
2538 case 'x':
2539 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2540 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2541 n += numprinted;
2542 numberresult += (numprinted + 1);
2543 assert(*(numberresult - 1) == '\0');
2544 assert(*(numberresult - 2) != '\0');
2545 assert(numprinted >= 0);
2546 assert(numberresult <= numberresults + numbersize);
2547 break;
2548 case 'p':
2549 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2550 /* %p is ill-defined: ensure leading 0x. */
2551 if (numberresult[1] == 'X')
2552 numberresult[1] = 'x';
2553 else if (numberresult[1] != 'x') {
2554 memmove(numberresult + 2, numberresult,
2555 strlen(numberresult) + 1);
2556 numberresult[0] = '0';
2557 numberresult[1] = 'x';
2558 numprinted += 2;
2559 }
2560 n += numprinted;
2561 numberresult += (numprinted + 1);
2562 assert(*(numberresult - 1) == '\0');
2563 assert(*(numberresult - 2) != '\0');
2564 assert(numprinted >= 0);
2565 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 break;
2567 case 's':
2568 {
2569 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002570 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002571 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002572 if (!str)
2573 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 /* since PyUnicode_DecodeUTF8 returns already flexible
2575 unicode objects, there is no need to call ready on them */
2576 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002577 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002579 /* Remember the str and switch to the next slot */
2580 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 break;
2582 }
2583 case 'U':
2584 {
2585 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002586 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 if (PyUnicode_READY(obj) == -1)
2588 goto fail;
2589 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002590 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 break;
2593 }
2594 case 'V':
2595 {
2596 PyObject *obj = va_arg(count, PyObject *);
2597 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002598 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002600 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002601 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 if (PyUnicode_READY(obj) == -1)
2603 goto fail;
2604 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002605 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002607 *callresult++ = NULL;
2608 }
2609 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002610 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002611 if (!str_obj)
2612 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002613 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002614 Py_DECREF(str_obj);
2615 goto fail;
2616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002618 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002620 *callresult++ = str_obj;
2621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 break;
2623 }
2624 case 'S':
2625 {
2626 PyObject *obj = va_arg(count, PyObject *);
2627 PyObject *str;
2628 assert(obj);
2629 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002630 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002632 if (PyUnicode_READY(str) == -1) {
2633 Py_DECREF(str);
2634 goto fail;
2635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002637 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 /* Remember the str and switch to the next slot */
2640 *callresult++ = str;
2641 break;
2642 }
2643 case 'R':
2644 {
2645 PyObject *obj = va_arg(count, PyObject *);
2646 PyObject *repr;
2647 assert(obj);
2648 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002649 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002651 if (PyUnicode_READY(repr) == -1) {
2652 Py_DECREF(repr);
2653 goto fail;
2654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002656 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002658 /* Remember the repr and switch to the next slot */
2659 *callresult++ = repr;
2660 break;
2661 }
2662 case 'A':
2663 {
2664 PyObject *obj = va_arg(count, PyObject *);
2665 PyObject *ascii;
2666 assert(obj);
2667 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002668 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002670 if (PyUnicode_READY(ascii) == -1) {
2671 Py_DECREF(ascii);
2672 goto fail;
2673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002675 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 /* Remember the repr and switch to the next slot */
2678 *callresult++ = ascii;
2679 break;
2680 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 default:
2682 /* if we stumble upon an unknown
2683 formatting code, copy the rest of
2684 the format string to the output
2685 string. (we cannot just skip the
2686 code, since there's no way to know
2687 what's in the argument list) */
2688 n += strlen(p);
2689 goto expand;
2690 }
2691 } else
2692 n++;
2693 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002695 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002696 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 we don't have to resize the string.
2698 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002699 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 if (!string)
2701 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 kind = PyUnicode_KIND(string);
2703 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002709 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002710
2711 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2713 /* checking for == because the last argument could be a empty
2714 string, which causes i to point to end, the assert at the end of
2715 the loop */
2716 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 switch (*f) {
2719 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002720 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 const int ordinal = va_arg(vargs, int);
2722 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002723 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002724 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002725 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002730 {
2731 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 /* unused, since we already have the result */
2733 if (*f == 'p')
2734 (void) va_arg(vargs, void *);
2735 else
2736 (void) va_arg(vargs, int);
2737 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002738 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002740 i += written;
2741 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 assert(*numberresult == '\0');
2743 numberresult++;
2744 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 case 's':
2748 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002749 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002751 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 size = PyUnicode_GET_LENGTH(*callresult);
2753 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002754 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002756 /* We're done with the unicode()/repr() => forget it */
2757 Py_DECREF(*callresult);
2758 /* switch to next unicode()/repr() result */
2759 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 break;
2761 }
2762 case 'U':
2763 {
2764 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765 Py_ssize_t size;
2766 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2767 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002768 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 break;
2771 }
2772 case 'V':
2773 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002775 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002776 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 size = PyUnicode_GET_LENGTH(obj);
2779 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002780 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 size = PyUnicode_GET_LENGTH(*callresult);
2784 assert(PyUnicode_KIND(*callresult) <=
2785 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002786 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002788 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 break;
2792 }
2793 case 'S':
2794 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002795 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002797 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 /* unused, since we already have the result */
2799 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002801 copy_characters(string, i, *callresult, 0, size);
2802 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 /* We're done with the unicode()/repr() => forget it */
2804 Py_DECREF(*callresult);
2805 /* switch to next unicode()/repr() result */
2806 ++callresult;
2807 break;
2808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 break;
2812 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002813 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 goto end;
2816 }
Victor Stinner1205f272010-09-11 00:54:47 +00002817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 else {
2819 assert(i < PyUnicode_GET_LENGTH(string));
2820 PyUnicode_WRITE(kind, data, i++, *f);
2821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002824
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002826 if (callresults)
2827 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 if (numberresults)
2829 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002830 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002832 if (callresults) {
2833 PyObject **callresult2 = callresults;
2834 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002835 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002836 ++callresult2;
2837 }
2838 PyObject_Free(callresults);
2839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840 if (numberresults)
2841 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002842 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002843}
2844
Walter Dörwaldd2034312007-05-18 16:29:38 +00002845PyObject *
2846PyUnicode_FromFormat(const char *format, ...)
2847{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002848 PyObject* ret;
2849 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002850
2851#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002852 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002853#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002855#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002856 ret = PyUnicode_FromFormatV(format, vargs);
2857 va_end(vargs);
2858 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002859}
2860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002861#ifdef HAVE_WCHAR_H
2862
Victor Stinner5593d8a2010-10-02 11:11:27 +00002863/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2864 convert a Unicode object to a wide character string.
2865
Victor Stinnerd88d9832011-09-06 02:00:05 +02002866 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002867 character) required to convert the unicode object. Ignore size argument.
2868
Victor Stinnerd88d9832011-09-06 02:00:05 +02002869 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002871 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002872static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002873unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002874 wchar_t *w,
2875 Py_ssize_t size)
2876{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002877 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002878 const wchar_t *wstr;
2879
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002880 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002881 if (wstr == NULL)
2882 return -1;
2883
Victor Stinner5593d8a2010-10-02 11:11:27 +00002884 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885 if (size > res)
2886 size = res + 1;
2887 else
2888 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002889 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002890 return res;
2891 }
2892 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002894}
2895
2896Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002897PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002898 wchar_t *w,
2899 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900{
2901 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 PyErr_BadInternalCall();
2903 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002905 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906}
2907
Victor Stinner137c34c2010-09-29 10:25:54 +00002908wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002909PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002910 Py_ssize_t *size)
2911{
2912 wchar_t* buffer;
2913 Py_ssize_t buflen;
2914
2915 if (unicode == NULL) {
2916 PyErr_BadInternalCall();
2917 return NULL;
2918 }
2919
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002920 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002921 if (buflen == -1)
2922 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002923 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002924 PyErr_NoMemory();
2925 return NULL;
2926 }
2927
Victor Stinner137c34c2010-09-29 10:25:54 +00002928 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2929 if (buffer == NULL) {
2930 PyErr_NoMemory();
2931 return NULL;
2932 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002933 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002934 if (buflen == -1)
2935 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002936 if (size != NULL)
2937 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002938 return buffer;
2939}
2940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002941#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942
Alexander Belopolsky40018472011-02-26 01:02:56 +00002943PyObject *
2944PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002946 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002947 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002948 PyErr_SetString(PyExc_ValueError,
2949 "chr() arg not in range(0x110000)");
2950 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002951 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953 if (ordinal < 256)
2954 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 v = PyUnicode_New(1, ordinal);
2957 if (v == NULL)
2958 return NULL;
2959 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002960 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002962}
2963
Alexander Belopolsky40018472011-02-26 01:02:56 +00002964PyObject *
2965PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002967 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002969 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002970 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002971 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 Py_INCREF(obj);
2973 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002974 }
2975 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002976 /* For a Unicode subtype that's not a Unicode object,
2977 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002978 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002979 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002980 PyErr_Format(PyExc_TypeError,
2981 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002982 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002983 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002984}
2985
Alexander Belopolsky40018472011-02-26 01:02:56 +00002986PyObject *
2987PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002988 const char *encoding,
2989 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002990{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002991 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002992 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002993
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 PyErr_BadInternalCall();
2996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002998
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002999 /* Decoding bytes objects is the most common case and should be fast */
3000 if (PyBytes_Check(obj)) {
3001 if (PyBytes_GET_SIZE(obj) == 0) {
3002 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003003 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003004 }
3005 else {
3006 v = PyUnicode_Decode(
3007 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3008 encoding, errors);
3009 }
3010 return v;
3011 }
3012
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003013 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003014 PyErr_SetString(PyExc_TypeError,
3015 "decoding str is not supported");
3016 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003017 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003018
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003019 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3020 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3021 PyErr_Format(PyExc_TypeError,
3022 "coercing to str: need bytes, bytearray "
3023 "or buffer-like object, %.80s found",
3024 Py_TYPE(obj)->tp_name);
3025 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003026 }
Tim Petersced69f82003-09-16 20:30:58 +00003027
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003028 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003030 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 }
Tim Petersced69f82003-09-16 20:30:58 +00003032 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003033 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003034
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003035 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003036 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037}
3038
/* Normalize an encoding name into the caller-supplied buffer `lower`:
   upper case ASCII letters are lowered and '_' is replaced by '-', so that
   e.g. "UTF_8" becomes "utf-8".  A NULL `encoding` is treated as "utf-8".

   `lower_len` is the capacity of `lower` in bytes, terminator included.

   Return 1 on success (`lower` is then null-terminated), 0 if the normalized
   name does not fit in `lower_len` bytes.  On a 0 return the content of
   `lower` is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* No room even for the terminator; also keeps `lower_len - 1` below
       from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the trailing '\0'. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3075
Alexander Belopolsky40018472011-02-26 01:02:56 +00003076PyObject *
3077PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003078 Py_ssize_t size,
3079 const char *encoding,
3080 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003081{
3082 PyObject *buffer = NULL, *unicode;
3083 Py_buffer info;
3084 char lower[11]; /* Enough for any encoding shortcut */
3085
Fred Drakee4315f52000-05-09 19:53:39 +00003086 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003087 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003088 if ((strcmp(lower, "utf-8") == 0) ||
3089 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003090 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003091 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003092 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003093 (strcmp(lower, "iso-8859-1") == 0))
3094 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003095#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003096 else if (strcmp(lower, "mbcs") == 0)
3097 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003098#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003099 else if (strcmp(lower, "ascii") == 0)
3100 return PyUnicode_DecodeASCII(s, size, errors);
3101 else if (strcmp(lower, "utf-16") == 0)
3102 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3103 else if (strcmp(lower, "utf-32") == 0)
3104 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106
3107 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003108 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003109 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003110 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003111 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112 if (buffer == NULL)
3113 goto onError;
3114 unicode = PyCodec_Decode(buffer, encoding, errors);
3115 if (unicode == NULL)
3116 goto onError;
3117 if (!PyUnicode_Check(unicode)) {
3118 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003119 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003120 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 Py_DECREF(unicode);
3122 goto onError;
3123 }
3124 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003125 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003126
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128 Py_XDECREF(buffer);
3129 return NULL;
3130}
3131
Alexander Belopolsky40018472011-02-26 01:02:56 +00003132PyObject *
3133PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003134 const char *encoding,
3135 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003136{
3137 PyObject *v;
3138
3139 if (!PyUnicode_Check(unicode)) {
3140 PyErr_BadArgument();
3141 goto onError;
3142 }
3143
3144 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003146
3147 /* Decode via the codec registry */
3148 v = PyCodec_Decode(unicode, encoding, errors);
3149 if (v == NULL)
3150 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003151 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003152
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154 return NULL;
3155}
3156
Alexander Belopolsky40018472011-02-26 01:02:56 +00003157PyObject *
3158PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003159 const char *encoding,
3160 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003161{
3162 PyObject *v;
3163
3164 if (!PyUnicode_Check(unicode)) {
3165 PyErr_BadArgument();
3166 goto onError;
3167 }
3168
3169 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003171
3172 /* Decode via the codec registry */
3173 v = PyCodec_Decode(unicode, encoding, errors);
3174 if (v == NULL)
3175 goto onError;
3176 if (!PyUnicode_Check(v)) {
3177 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003178 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003179 Py_TYPE(v)->tp_name);
3180 Py_DECREF(v);
3181 goto onError;
3182 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003183 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003184
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186 return NULL;
3187}
3188
Alexander Belopolsky40018472011-02-26 01:02:56 +00003189PyObject *
3190PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003191 Py_ssize_t size,
3192 const char *encoding,
3193 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194{
3195 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003196
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 unicode = PyUnicode_FromUnicode(s, size);
3198 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3201 Py_DECREF(unicode);
3202 return v;
3203}
3204
Alexander Belopolsky40018472011-02-26 01:02:56 +00003205PyObject *
3206PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003207 const char *encoding,
3208 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003209{
3210 PyObject *v;
3211
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
3214 goto onError;
3215 }
3216
3217 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003219
3220 /* Encode via the codec registry */
3221 v = PyCodec_Encode(unicode, encoding, errors);
3222 if (v == NULL)
3223 goto onError;
3224 return v;
3225
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003227 return NULL;
3228}
3229
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert, by converting one character at a time (a surrogate pair counts
   as one unit when wchar_t is 16 bits wide).

   Returns the index, in wchar_t units, of the offending character, or 0 if
   every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One unit to convert plus a terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3276
Victor Stinner1b579672011-12-17 05:47:23 +01003277static int
3278locale_error_handler(const char *errors, int *surrogateescape)
3279{
3280 if (errors == NULL) {
3281 *surrogateescape = 0;
3282 return 0;
3283 }
3284
3285 if (strcmp(errors, "strict") == 0) {
3286 *surrogateescape = 0;
3287 return 0;
3288 }
3289 if (strcmp(errors, "surrogateescape") == 0) {
3290 *surrogateescape = 1;
3291 return 0;
3292 }
3293 PyErr_Format(PyExc_ValueError,
3294 "only 'strict' and 'surrogateescape' error handlers "
3295 "are supported, not '%s'",
3296 errors);
3297 return -1;
3298}
3299
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003300PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003301PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003302{
3303 Py_ssize_t wlen, wlen2;
3304 wchar_t *wstr;
3305 PyObject *bytes = NULL;
3306 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003307 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003308 PyObject *exc;
3309 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003310 int surrogateescape;
3311
3312 if (locale_error_handler(errors, &surrogateescape) < 0)
3313 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003314
3315 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3316 if (wstr == NULL)
3317 return NULL;
3318
3319 wlen2 = wcslen(wstr);
3320 if (wlen2 != wlen) {
3321 PyMem_Free(wstr);
3322 PyErr_SetString(PyExc_TypeError, "embedded null character");
3323 return NULL;
3324 }
3325
3326 if (surrogateescape) {
3327 /* locale encoding with surrogateescape */
3328 char *str;
3329
3330 str = _Py_wchar2char(wstr, &error_pos);
3331 if (str == NULL) {
3332 if (error_pos == (size_t)-1) {
3333 PyErr_NoMemory();
3334 PyMem_Free(wstr);
3335 return NULL;
3336 }
3337 else {
3338 goto encode_error;
3339 }
3340 }
3341 PyMem_Free(wstr);
3342
3343 bytes = PyBytes_FromString(str);
3344 PyMem_Free(str);
3345 }
3346 else {
3347 size_t len, len2;
3348
3349 len = wcstombs(NULL, wstr, 0);
3350 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003351 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003352 goto encode_error;
3353 }
3354
3355 bytes = PyBytes_FromStringAndSize(NULL, len);
3356 if (bytes == NULL) {
3357 PyMem_Free(wstr);
3358 return NULL;
3359 }
3360
3361 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3362 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003363 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003364 goto encode_error;
3365 }
3366 PyMem_Free(wstr);
3367 }
3368 return bytes;
3369
3370encode_error:
3371 errmsg = strerror(errno);
3372 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003373
3374 if (error_pos == (size_t)-1)
3375 error_pos = wcstombs_errorpos(wstr);
3376
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003377 PyMem_Free(wstr);
3378 Py_XDECREF(bytes);
3379
Victor Stinner2f197072011-12-17 07:08:30 +01003380 if (errmsg != NULL) {
3381 size_t errlen;
3382 wstr = _Py_char2wchar(errmsg, &errlen);
3383 if (wstr != NULL) {
3384 reason = PyUnicode_FromWideChar(wstr, errlen);
3385 PyMem_Free(wstr);
3386 } else
3387 errmsg = NULL;
3388 }
3389 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003390 reason = PyUnicode_FromString(
3391 "wcstombs() encountered an unencodable "
3392 "wide character");
3393 if (reason == NULL)
3394 return NULL;
3395
3396 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3397 "locale", unicode,
3398 (Py_ssize_t)error_pos,
3399 (Py_ssize_t)(error_pos+1),
3400 reason);
3401 Py_DECREF(reason);
3402 if (exc != NULL) {
3403 PyCodec_StrictErrors(exc);
3404 Py_XDECREF(exc);
3405 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003406 return NULL;
3407}
3408
Victor Stinnerad158722010-10-27 00:25:46 +00003409PyObject *
3410PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003411{
Victor Stinner99b95382011-07-04 14:23:54 +02003412#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003413 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003414#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003416#else
Victor Stinner793b5312011-04-27 00:24:21 +02003417 PyInterpreterState *interp = PyThreadState_GET()->interp;
3418 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3419 cannot use it to encode and decode filenames before it is loaded. Load
3420 the Python codec requires to encode at least its own filename. Use the C
3421 version of the locale codec until the codec registry is initialized and
3422 the Python codec is loaded.
3423
3424 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3425 cannot only rely on it: check also interp->fscodec_initialized for
3426 subinterpreters. */
3427 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003428 return PyUnicode_AsEncodedString(unicode,
3429 Py_FileSystemDefaultEncoding,
3430 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003431 }
3432 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003433 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003434 }
Victor Stinnerad158722010-10-27 00:25:46 +00003435#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 const char *encoding,
3441 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442{
3443 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003444 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003445
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 }
Fred Drakee4315f52000-05-09 19:53:39 +00003450
Fred Drakee4315f52000-05-09 19:53:39 +00003451 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003452 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003453 if ((strcmp(lower, "utf-8") == 0) ||
3454 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003455 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003456 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003458 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003460 }
Victor Stinner37296e82010-06-10 13:36:23 +00003461 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003462 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003463 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003464 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003465#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003466 else if (strcmp(lower, "mbcs") == 0)
3467 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003468#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003469 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472
3473 /* Encode via the codec registry */
3474 v = PyCodec_Encode(unicode, encoding, errors);
3475 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003476 return NULL;
3477
3478 /* The normal path */
3479 if (PyBytes_Check(v))
3480 return v;
3481
3482 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003483 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003484 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003485 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003486
3487 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3488 "encoder %s returned bytearray instead of bytes",
3489 encoding);
3490 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003491 Py_DECREF(v);
3492 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003493 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003494
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003495 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3496 Py_DECREF(v);
3497 return b;
3498 }
3499
3500 PyErr_Format(PyExc_TypeError,
3501 "encoder did not return a bytes object (type=%.400s)",
3502 Py_TYPE(v)->tp_name);
3503 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504 return NULL;
3505}
3506
Alexander Belopolsky40018472011-02-26 01:02:56 +00003507PyObject *
3508PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003509 const char *encoding,
3510 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003511{
3512 PyObject *v;
3513
3514 if (!PyUnicode_Check(unicode)) {
3515 PyErr_BadArgument();
3516 goto onError;
3517 }
3518
3519 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003521
3522 /* Encode via the codec registry */
3523 v = PyCodec_Encode(unicode, encoding, errors);
3524 if (v == NULL)
3525 goto onError;
3526 if (!PyUnicode_Check(v)) {
3527 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003528 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003529 Py_TYPE(v)->tp_name);
3530 Py_DECREF(v);
3531 goto onError;
3532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003534
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003536 return NULL;
3537}
3538
/* Locate the first byte sequence in the `len` bytes at `str` that mbrtowc()
   reports as invalid or incomplete.

   Returns the byte offset of that sequence, or 0 when none is found.  When
   HAVE_MBRTOWC is not defined no scan is performed and 0 is returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3569
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003570PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003571PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003572 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003573{
3574 wchar_t smallbuf[256];
3575 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3576 wchar_t *wstr;
3577 size_t wlen, wlen2;
3578 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003579 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003580 size_t error_pos;
3581 char *errmsg;
3582 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003583
3584 if (locale_error_handler(errors, &surrogateescape) < 0)
3585 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003586
3587 if (str[len] != '\0' || len != strlen(str)) {
3588 PyErr_SetString(PyExc_TypeError, "embedded null character");
3589 return NULL;
3590 }
3591
3592 if (surrogateescape)
3593 {
3594 wstr = _Py_char2wchar(str, &wlen);
3595 if (wstr == NULL) {
3596 if (wlen == (size_t)-1)
3597 PyErr_NoMemory();
3598 else
3599 PyErr_SetFromErrno(PyExc_OSError);
3600 return NULL;
3601 }
3602
3603 unicode = PyUnicode_FromWideChar(wstr, wlen);
3604 PyMem_Free(wstr);
3605 }
3606 else {
3607#ifndef HAVE_BROKEN_MBSTOWCS
3608 wlen = mbstowcs(NULL, str, 0);
3609#else
3610 wlen = len;
3611#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003612 if (wlen == (size_t)-1)
3613 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003614 if (wlen+1 <= smallbuf_len) {
3615 wstr = smallbuf;
3616 }
3617 else {
3618 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3619 return PyErr_NoMemory();
3620
3621 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3622 if (!wstr)
3623 return PyErr_NoMemory();
3624 }
3625
3626 /* This shouldn't fail now */
3627 wlen2 = mbstowcs(wstr, str, wlen+1);
3628 if (wlen2 == (size_t)-1) {
3629 if (wstr != smallbuf)
3630 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003631 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003632 }
3633#ifdef HAVE_BROKEN_MBSTOWCS
3634 assert(wlen2 == wlen);
3635#endif
3636 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3637 if (wstr != smallbuf)
3638 PyMem_Free(wstr);
3639 }
3640 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003641
3642decode_error:
3643 errmsg = strerror(errno);
3644 assert(errmsg != NULL);
3645
3646 error_pos = mbstowcs_errorpos(str, len);
3647 if (errmsg != NULL) {
3648 size_t errlen;
3649 wstr = _Py_char2wchar(errmsg, &errlen);
3650 if (wstr != NULL) {
3651 reason = PyUnicode_FromWideChar(wstr, errlen);
3652 PyMem_Free(wstr);
3653 } else
3654 errmsg = NULL;
3655 }
3656 if (errmsg == NULL)
3657 reason = PyUnicode_FromString(
3658 "mbstowcs() encountered an invalid multibyte sequence");
3659 if (reason == NULL)
3660 return NULL;
3661
3662 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3663 "locale", str, len,
3664 (Py_ssize_t)error_pos,
3665 (Py_ssize_t)(error_pos+1),
3666 reason);
3667 Py_DECREF(reason);
3668 if (exc != NULL) {
3669 PyCodec_StrictErrors(exc);
3670 Py_XDECREF(exc);
3671 }
3672 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003673}
3674
3675PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003676PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003677{
3678 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003679 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003680}
3681
3682
3683PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003684PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003685 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003686 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3687}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003688
Christian Heimes5894ba72007-11-04 11:43:14 +00003689PyObject*
3690PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3691{
Victor Stinner99b95382011-07-04 14:23:54 +02003692#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003693 return PyUnicode_DecodeMBCS(s, size, NULL);
3694#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003695 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003696#else
Victor Stinner793b5312011-04-27 00:24:21 +02003697 PyInterpreterState *interp = PyThreadState_GET()->interp;
3698 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3699 cannot use it to encode and decode filenames before it is loaded. Load
3700 the Python codec requires to encode at least its own filename. Use the C
3701 version of the locale codec until the codec registry is initialized and
3702 the Python codec is loaded.
3703
3704 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3705 cannot only rely on it: check also interp->fscodec_initialized for
3706 subinterpreters. */
3707 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003708 return PyUnicode_Decode(s, size,
3709 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003710 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003711 }
3712 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003713 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003714 }
Victor Stinnerad158722010-10-27 00:25:46 +00003715#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716}
3717
Martin v. Löwis011e8422009-05-05 04:43:17 +00003718
3719int
Antoine Pitrou13348842012-01-29 18:36:34 +01003720_PyUnicode_HasNULChars(PyObject* s)
3721{
3722 static PyObject *nul = NULL;
3723
3724 if (nul == NULL)
3725 nul = PyUnicode_FromStringAndSize("\0", 1);
3726 if (nul == NULL)
3727 return -1;
3728 return PyUnicode_Contains(s, nul);
3729}
3730
3731
3732int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003733PyUnicode_FSConverter(PyObject* arg, void* addr)
3734{
3735 PyObject *output = NULL;
3736 Py_ssize_t size;
3737 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003738 if (arg == NULL) {
3739 Py_DECREF(*(PyObject**)addr);
3740 return 1;
3741 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003742 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003743 output = arg;
3744 Py_INCREF(output);
3745 }
3746 else {
3747 arg = PyUnicode_FromObject(arg);
3748 if (!arg)
3749 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003750 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003751 Py_DECREF(arg);
3752 if (!output)
3753 return 0;
3754 if (!PyBytes_Check(output)) {
3755 Py_DECREF(output);
3756 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3757 return 0;
3758 }
3759 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003760 size = PyBytes_GET_SIZE(output);
3761 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003762 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003763 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003764 Py_DECREF(output);
3765 return 0;
3766 }
3767 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003768 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769}
3770
3771
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003772int
3773PyUnicode_FSDecoder(PyObject* arg, void* addr)
3774{
3775 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003776 if (arg == NULL) {
3777 Py_DECREF(*(PyObject**)addr);
3778 return 1;
3779 }
3780 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003781 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003783 output = arg;
3784 Py_INCREF(output);
3785 }
3786 else {
3787 arg = PyBytes_FromObject(arg);
3788 if (!arg)
3789 return 0;
3790 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3791 PyBytes_GET_SIZE(arg));
3792 Py_DECREF(arg);
3793 if (!output)
3794 return 0;
3795 if (!PyUnicode_Check(output)) {
3796 Py_DECREF(output);
3797 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3798 return 0;
3799 }
3800 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003801 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003802 Py_DECREF(output);
3803 return 0;
3804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003806 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003807 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3808 Py_DECREF(output);
3809 return 0;
3810 }
3811 *(PyObject**)addr = output;
3812 return Py_CLEANUP_SUPPORTED;
3813}
3814
3815
Martin v. Löwis5b222132007-06-10 09:51:05 +00003816char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003818{
Christian Heimesf3863112007-11-22 07:46:41 +00003819 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003821 if (!PyUnicode_Check(unicode)) {
3822 PyErr_BadArgument();
3823 return NULL;
3824 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003825 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003826 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003828 if (PyUnicode_UTF8(unicode) == NULL) {
3829 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3831 if (bytes == NULL)
3832 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3834 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 Py_DECREF(bytes);
3836 return NULL;
3837 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003838 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3839 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3840 PyBytes_AS_STRING(bytes),
3841 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842 Py_DECREF(bytes);
3843 }
3844
3845 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003846 *psize = PyUnicode_UTF8_LENGTH(unicode);
3847 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003848}
3849
3850char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003851PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3854}
3855
3856#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003857static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858#endif
3859
3860
3861Py_UNICODE *
3862PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 const unsigned char *one_byte;
3865#if SIZEOF_WCHAR_T == 4
3866 const Py_UCS2 *two_bytes;
3867#else
3868 const Py_UCS4 *four_bytes;
3869 const Py_UCS4 *ucs4_end;
3870 Py_ssize_t num_surrogates;
3871#endif
3872 wchar_t *w;
3873 wchar_t *wchar_end;
3874
3875 if (!PyUnicode_Check(unicode)) {
3876 PyErr_BadArgument();
3877 return NULL;
3878 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 assert(_PyUnicode_KIND(unicode) != 0);
3882 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883
3884#ifdef Py_DEBUG
3885 ++unicode_as_unicode_calls;
3886#endif
3887
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3891 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 num_surrogates = 0;
3893
3894 for (; four_bytes < ucs4_end; ++four_bytes) {
3895 if (*four_bytes > 0xFFFF)
3896 ++num_surrogates;
3897 }
3898
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3900 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3901 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902 PyErr_NoMemory();
3903 return NULL;
3904 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003905 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003907 w = _PyUnicode_WSTR(unicode);
3908 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3909 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3911 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003912 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003914 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3915 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 }
3917 else
3918 *w = *four_bytes;
3919
3920 if (w > wchar_end) {
3921 assert(0 && "Miscalculated string end");
3922 }
3923 }
3924 *w = 0;
3925#else
3926 /* sizeof(wchar_t) == 4 */
3927 Py_FatalError("Impossible unicode object state, wstr and str "
3928 "should share memory already.");
3929 return NULL;
3930#endif
3931 }
3932 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003933 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3934 (_PyUnicode_LENGTH(unicode) + 1));
3935 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 PyErr_NoMemory();
3937 return NULL;
3938 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3940 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3941 w = _PyUnicode_WSTR(unicode);
3942 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003944 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3945 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 for (; w < wchar_end; ++one_byte, ++w)
3947 *w = *one_byte;
3948 /* null-terminate the wstr */
3949 *w = 0;
3950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 for (; w < wchar_end; ++two_bytes, ++w)
3955 *w = *two_bytes;
3956 /* null-terminate the wstr */
3957 *w = 0;
3958#else
3959 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003960 PyObject_FREE(_PyUnicode_WSTR(unicode));
3961 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 Py_FatalError("Impossible unicode object state, wstr "
3963 "and str should share memory already.");
3964 return NULL;
3965#endif
3966 }
3967 else {
3968 assert(0 && "This should never happen.");
3969 }
3970 }
3971 }
3972 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003973 *size = PyUnicode_WSTR_LENGTH(unicode);
3974 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003975}
3976
Alexander Belopolsky40018472011-02-26 01:02:56 +00003977Py_UNICODE *
3978PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981}
3982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Alexander Belopolsky40018472011-02-26 01:02:56 +00003984Py_ssize_t
3985PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986{
3987 if (!PyUnicode_Check(unicode)) {
3988 PyErr_BadArgument();
3989 goto onError;
3990 }
3991 return PyUnicode_GET_SIZE(unicode);
3992
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 return -1;
3995}
3996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997Py_ssize_t
3998PyUnicode_GetLength(PyObject *unicode)
3999{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004000 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 PyErr_BadArgument();
4002 return -1;
4003 }
4004
4005 return PyUnicode_GET_LENGTH(unicode);
4006}
4007
4008Py_UCS4
4009PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4010{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004011 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4012 PyErr_BadArgument();
4013 return (Py_UCS4)-1;
4014 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004015 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004016 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return (Py_UCS4)-1;
4018 }
4019 return PyUnicode_READ_CHAR(unicode, index);
4020}
4021
4022int
4023PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4024{
4025 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004026 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 return -1;
4028 }
Victor Stinner488fa492011-12-12 00:01:39 +01004029 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004030 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004031 PyErr_SetString(PyExc_IndexError, "string index out of range");
4032 return -1;
4033 }
Victor Stinner488fa492011-12-12 00:01:39 +01004034 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004035 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004036 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4037 PyErr_SetString(PyExc_ValueError, "character out of range");
4038 return -1;
4039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4041 index, ch);
4042 return 0;
4043}
4044
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4050
Victor Stinner554f3f02010-06-16 23:33:54 +00004051/* create or adjust a UnicodeDecodeError */
4052static void
4053make_decode_exception(PyObject **exceptionObject,
4054 const char *encoding,
4055 const char *input, Py_ssize_t length,
4056 Py_ssize_t startpos, Py_ssize_t endpos,
4057 const char *reason)
4058{
4059 if (*exceptionObject == NULL) {
4060 *exceptionObject = PyUnicodeDecodeError_Create(
4061 encoding, input, length, startpos, endpos, reason);
4062 }
4063 else {
4064 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4065 goto onError;
4066 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4067 goto onError;
4068 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4069 goto onError;
4070 }
4071 return;
4072
4073onError:
4074 Py_DECREF(*exceptionObject);
4075 *exceptionObject = NULL;
4076}
4077
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078/* error handling callback helper:
4079 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004080 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 and adjust various state variables.
4082 return 0 on success, -1 on error
4083*/
4084
Alexander Belopolsky40018472011-02-26 01:02:56 +00004085static int
4086unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004087 const char *encoding, const char *reason,
4088 const char **input, const char **inend, Py_ssize_t *startinpos,
4089 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004090 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004092 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093
4094 PyObject *restuple = NULL;
4095 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004096 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004097 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004098 Py_ssize_t requiredsize;
4099 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004100 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004101 int res = -1;
4102
Victor Stinner596a6c42011-11-09 00:02:18 +01004103 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4104 outsize = PyUnicode_GET_LENGTH(*output);
4105 else
4106 outsize = _PyUnicode_WSTR_LENGTH(*output);
4107
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 *errorHandler = PyCodec_LookupError(errors);
4110 if (*errorHandler == NULL)
4111 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 }
4113
Victor Stinner554f3f02010-06-16 23:33:54 +00004114 make_decode_exception(exceptionObject,
4115 encoding,
4116 *input, *inend - *input,
4117 *startinpos, *endinpos,
4118 reason);
4119 if (*exceptionObject == NULL)
4120 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121
4122 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4123 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004126 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 }
4129 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004131 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004132 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004133
4134 /* Copy back the bytes variables, which might have been modified by the
4135 callback */
4136 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4137 if (!inputobj)
4138 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004139 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004141 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004142 *input = PyBytes_AS_STRING(inputobj);
4143 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004145 /* we can DECREF safely, as the exception has another reference,
4146 so the object won't go away. */
4147 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004151 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4153 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155
Victor Stinner596a6c42011-11-09 00:02:18 +01004156 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4157 /* need more space? (at least enough for what we
4158 have+the replacement+the rest of the string (starting
4159 at the new input position), so we won't have to check space
4160 when there are no errors in the rest of the string) */
4161 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4162 requiredsize = *outpos + replen + insize-newpos;
4163 if (requiredsize > outsize) {
4164 if (requiredsize<2*outsize)
4165 requiredsize = 2*outsize;
4166 if (unicode_resize(output, requiredsize) < 0)
4167 goto onError;
4168 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004169 if (unicode_widen(output, *outpos,
4170 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004171 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004172 copy_characters(*output, *outpos, repunicode, 0, replen);
4173 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004175 else {
4176 wchar_t *repwstr;
4177 Py_ssize_t repwlen;
4178 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4179 if (repwstr == NULL)
4180 goto onError;
4181 /* need more space? (at least enough for what we
4182 have+the replacement+the rest of the string (starting
4183 at the new input position), so we won't have to check space
4184 when there are no errors in the rest of the string) */
4185 requiredsize = *outpos + repwlen + insize-newpos;
4186 if (requiredsize > outsize) {
4187 if (requiredsize < 2*outsize)
4188 requiredsize = 2*outsize;
4189 if (unicode_resize(output, requiredsize) < 0)
4190 goto onError;
4191 }
4192 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4193 *outpos += repwlen;
4194 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004196 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 /* we made it! */
4199 res = 0;
4200
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 Py_XDECREF(restuple);
4203 return res;
4204}
4205
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004206/* --- UTF-7 Codec -------------------------------------------------------- */
4207
Antoine Pitrou244651a2009-05-04 18:56:13 +00004208/* See RFC2152 for details. We encode conservatively and decode liberally. */
4209
4210/* Three simple macros defining base-64. */
4211
/* Is c a base-64 character?  (The argument is evaluated several times.) */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and
   anything else (i.e. '/') to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4240
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times.  directO and directWS are parenthesized
 * so that compound expressions (e.g. a || b) are passed through intact.
 * NUL and characters >= 128 are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286
Alexander Belopolsky40018472011-02-26 01:02:56 +00004287PyObject *
4288PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004289 Py_ssize_t size,
4290 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004291{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004292 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4293}
4294
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295/* The decoder. The only state we preserve is our read position,
4296 * i.e. how many characters we have consumed. So if we end in the
4297 * middle of a shift sequence we have to back off the read position
4298 * and the output to the beginning of the sequence, otherwise we lose
4299 * all the shift state (seen bits, number of bits seen, high
4300 * surrogate). */
4301
Alexander Belopolsky40018472011-02-26 01:02:56 +00004302PyObject *
4303PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004304 Py_ssize_t size,
4305 const char *errors,
4306 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004307{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004308 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004309 Py_ssize_t startinpos;
4310 Py_ssize_t endinpos;
4311 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004313 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 const char *errmsg = "";
4315 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004316 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004317 unsigned int base64bits = 0;
4318 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004319 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 PyObject *errorHandler = NULL;
4321 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004323 /* Start off assuming it's all ASCII. Widen later as necessary. */
4324 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 if (!unicode)
4326 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004327 if (size == 0) {
4328 if (consumed)
4329 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004330 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004331 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 e = s + size;
4335
4336 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004337 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004339 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 if (inShift) { /* in a base-64 section */
4342 if (IS_BASE64(ch)) { /* consume a base-64 character */
4343 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4344 base64bits += 6;
4345 s++;
4346 if (base64bits >= 16) {
4347 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004348 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 base64bits -= 16;
4350 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4351 if (surrogate) {
4352 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004353 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4354 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004355 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4356 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004358 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004361 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4362 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 }
4365 }
Victor Stinner551ac952011-11-29 22:58:13 +01004366 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 /* first surrogate */
4368 surrogate = outCh;
4369 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 }
4374 }
4375 }
4376 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 inShift = 0;
4378 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004380 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4381 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004382 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 if (base64bits > 0) { /* left-over bits */
4385 if (base64bits >= 6) {
4386 /* We've seen at least one base-64 character */
4387 errmsg = "partial character in shift sequence";
4388 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 else {
4391 /* Some bits remain; they should be zero */
4392 if (base64buffer != 0) {
4393 errmsg = "non-zero padding bits in shift sequence";
4394 goto utf7Error;
4395 }
4396 }
4397 }
4398 if (ch != '-') {
4399 /* '-' is absorbed; other terminating
4400 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004401 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4402 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 }
4405 }
4406 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 s++; /* consume '+' */
4409 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4412 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 }
4414 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004416 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418 }
4419 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004421 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4422 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 s++;
4424 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 else {
4426 startinpos = s-starts;
4427 s++;
4428 errmsg = "unexpected special character";
4429 goto utf7Error;
4430 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004432utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 endinpos = s-starts;
4434 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 errors, &errorHandler,
4436 "utf7", errmsg,
4437 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004438 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004440 }
4441
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 /* end of string */
4443
4444 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4445 /* if we're in an inconsistent state, that's an error */
4446 if (surrogate ||
4447 (base64bits >= 6) ||
4448 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 endinpos = size;
4450 if (unicode_decode_call_errorhandler(
4451 errors, &errorHandler,
4452 "utf7", "unterminated shift sequence",
4453 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004454 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 goto onError;
4456 if (s < e)
4457 goto restart;
4458 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460
4461 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004462 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004464 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004465 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 }
4467 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004468 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004470 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004472 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 goto onError;
4474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 Py_XDECREF(errorHandler);
4476 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004477 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 Py_XDECREF(errorHandler);
4481 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004482 Py_DECREF(unicode);
4483 return NULL;
4484}
4485
4486
Alexander Belopolsky40018472011-02-26 01:02:56 +00004487PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488_PyUnicode_EncodeUTF7(PyObject *str,
4489 int base64SetO,
4490 int base64WhiteSpace,
4491 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004493 int kind;
4494 void *data;
4495 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004496 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004497 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 unsigned int base64bits = 0;
4501 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 char * out;
4503 char * start;
4504
Benjamin Petersonbac79492012-01-14 13:34:47 -05004505 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 return NULL;
4507 kind = PyUnicode_KIND(str);
4508 data = PyUnicode_DATA(str);
4509 len = PyUnicode_GET_LENGTH(str);
4510
4511 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004514 /* It might be possible to tighten this worst case */
4515 allocated = 8 * len;
4516 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004517 return PyErr_NoMemory();
4518
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 if (v == NULL)
4521 return NULL;
4522
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004523 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004525 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 if (inShift) {
4528 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4529 /* shifting out */
4530 if (base64bits) { /* output remaining bits */
4531 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4532 base64buffer = 0;
4533 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534 }
4535 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 /* Characters not in the BASE64 set implicitly unshift the sequence
4537 so no '-' is required, except if the character is itself a '-' */
4538 if (IS_BASE64(ch) || ch == '-') {
4539 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 *out++ = (char) ch;
4542 }
4543 else {
4544 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004545 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 else { /* not in a shift sequence */
4548 if (ch == '+') {
4549 *out++ = '+';
4550 *out++ = '-';
4551 }
4552 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4553 *out++ = (char) ch;
4554 }
4555 else {
4556 *out++ = '+';
4557 inShift = 1;
4558 goto encode_char;
4559 }
4560 }
4561 continue;
4562encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004564 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004565
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 /* code first surrogate */
4567 base64bits += 16;
4568 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4569 while (base64bits >= 6) {
4570 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4571 base64bits -= 6;
4572 }
4573 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004574 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 base64bits += 16;
4577 base64buffer = (base64buffer << 16) | ch;
4578 while (base64bits >= 6) {
4579 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4580 base64bits -= 6;
4581 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 if (base64bits)
4584 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4585 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004587 if (_PyBytes_Resize(&v, out - start) < 0)
4588 return NULL;
4589 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004591PyObject *
4592PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4593 Py_ssize_t size,
4594 int base64SetO,
4595 int base64WhiteSpace,
4596 const char *errors)
4597{
4598 PyObject *result;
4599 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4600 if (tmp == NULL)
4601 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004602 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004603 base64WhiteSpace, errors);
4604 Py_DECREF(tmp);
4605 return result;
4606}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608#undef IS_BASE64
4609#undef FROM_BASE64
4610#undef TO_BASE64
4611#undef DECODE_DIRECT
4612#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614/* --- UTF-8 Codec -------------------------------------------------------- */
4615
/* Map a UTF-8 encoded prefix (lead) byte to the length in bytes of the
   sequence it introduces.  Zero means illegal prefix: continuation bytes
   (80-BF), the prefixes C0-C1, and F5-FF.  See RFC 3629 for details.

   The table is never written to, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4637
Alexander Belopolsky40018472011-02-26 01:02:56 +00004638PyObject *
4639PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004640 Py_ssize_t size,
4641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642{
Walter Dörwald69652032004-09-07 20:24:22 +00004643 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4644}
4645
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004646#include "stringlib/ucs1lib.h"
4647#include "stringlib/codecs.h"
4648#include "stringlib/undef.h"
4649
4650#include "stringlib/ucs2lib.h"
4651#include "stringlib/codecs.h"
4652#include "stringlib/undef.h"
4653
4654#include "stringlib/ucs4lib.h"
4655#include "stringlib/codecs.h"
4656#include "stringlib/undef.h"
4657
Antoine Pitrouab868312009-01-10 15:40:25 +00004658/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4659#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4660
4661/* Mask to quickly check whether a C 'long' contains a
4662 non-ASCII, UTF8-encoded char. */
4663#if (SIZEOF_LONG == 8)
4664# define ASCII_CHAR_MASK 0x8080808080808080L
4665#elif (SIZEOF_LONG == 4)
4666# define ASCII_CHAR_MASK 0x80808080L
4667#else
4668# error C 'long' size should be either 4 or 8!
4669#endif
4670
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004671/* Scans a UTF-8 string and returns the maximum character to be expected
4672 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004674 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004675 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676 */
4677static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004678utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 const unsigned char *end = p + string_size;
4682 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004683
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004684 assert(unicode_size != NULL);
4685
4686 /* By having a cascade of independent loops which fallback onto each
4687 other, we minimize the amount of work done in the average loop
4688 iteration, and we also maximize the CPU's ability to predict
4689 branches correctly (because a given condition will have always the
4690 same boolean outcome except perhaps in the last iteration of the
4691 corresponding loop).
4692 In the general case this brings us rather close to decoding
4693 performance pre-PEP 393, despite the two-pass decoding.
4694
4695 Note that the pure ASCII loop is not duplicated once a non-ASCII
4696 character has been encountered. It is actually a pessimization (by
4697 a significant factor) to use this loop on text with many non-ASCII
4698 characters, and it is important to avoid bad performance on valid
4699 utf-8 data (invalid utf-8 being a different can of worms).
4700 */
4701
4702 /* ASCII */
4703 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704 /* Only check value if it's not a ASCII char... */
4705 if (*p < 0x80) {
4706 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4707 an explanation. */
4708 if (!((size_t) p & LONG_PTR_MASK)) {
4709 /* Help register allocation */
4710 register const unsigned char *_p = p;
4711 while (_p < aligned_end) {
4712 unsigned long value = *(unsigned long *) _p;
4713 if (value & ASCII_CHAR_MASK)
4714 break;
4715 _p += SIZEOF_LONG;
4716 char_count += SIZEOF_LONG;
4717 }
4718 p = _p;
4719 if (p == end)
4720 break;
4721 }
4722 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004723 if (*p < 0x80)
4724 ++char_count;
4725 else
4726 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004727 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004728 *unicode_size = char_count;
4729 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004730
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004731_ucs1loop:
4732 for (; p < end; ++p) {
4733 if (*p < 0xc4)
4734 char_count += ((*p & 0xc0) != 0x80);
4735 else
4736 goto _ucs2loop;
4737 }
4738 *unicode_size = char_count;
4739 return 255;
4740
4741_ucs2loop:
4742 for (; p < end; ++p) {
4743 if (*p < 0xf0)
4744 char_count += ((*p & 0xc0) != 0x80);
4745 else
4746 goto _ucs4loop;
4747 }
4748 *unicode_size = char_count;
4749 return 65535;
4750
4751_ucs4loop:
4752 for (; p < end; ++p) {
4753 char_count += ((*p & 0xc0) != 0x80);
4754 }
4755 *unicode_size = char_count;
4756 return 65537;
4757}
4758
/* Store `value` at position `index` of the string being built, going
   through unicode_putchar().  If `index` is greater than the current
   length, the string is first resized to index + index/8; that
   overallocates, so the result needs to shrink at the end.
   Implicit parameters: the `unicode` variable and the `onError` label,
   which is jumped to when resizing or writing fails.
   `index` is evaluated exactly once.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t wpos_ = (index);                                     \
        if (wpos_ > PyUnicode_GET_LENGTH(unicode)) {                    \
            if (unicode_resize(&unicode, wpos_ + wpos_/8) < 0)          \
                goto onError;                                           \
        }                                                               \
        if (unicode_putchar(&unicode, &wpos_, (value)) < 0)             \
            goto onError;                                               \
    } while (0)
4772
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004773static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004774decode_utf8_errors(const char *starts,
4775 Py_ssize_t size,
4776 const char *errors,
4777 Py_ssize_t *consumed,
4778 const char *s,
4779 PyObject *unicode,
4780 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004781{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004783 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004784 Py_ssize_t startinpos;
4785 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004786 const char *e = starts + size;
4787 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004788 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 PyObject *errorHandler = NULL;
4790 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004791
Antoine Pitrouab868312009-01-10 15:40:25 +00004792 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793
4794 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004795 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796
4797 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004798 /* Fast path for runs of ASCII characters. Given that common UTF-8
4799 input will consist of an overwhelming majority of ASCII
4800 characters, we try to optimize for this case by checking
4801 as many characters as a C 'long' can contain.
4802 First, check if we can do an aligned read, as most CPUs have
4803 a penalty for unaligned reads.
4804 */
4805 if (!((size_t) s & LONG_PTR_MASK)) {
4806 /* Help register allocation */
4807 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004809 while (_s < aligned_end) {
4810 /* Read a whole long at a time (either 4 or 8 bytes),
4811 and do a fast unrolled copy if it only contains ASCII
4812 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 unsigned long value = *(unsigned long *) _s;
4814 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004815 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004816 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4817 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4818 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4819 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004820#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004821 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4822 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4823 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4824 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004825#endif
4826 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004828 }
4829 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004831 if (s == e)
4832 break;
4833 ch = (unsigned char)*s;
4834 }
4835 }
4836
4837 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004838 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 s++;
4840 continue;
4841 }
4842
4843 n = utf8_code_length[ch];
4844
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004845 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 if (consumed)
4847 break;
4848 else {
4849 errmsg = "unexpected end of data";
4850 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004851 endinpos = startinpos+1;
4852 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4853 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 goto utf8Error;
4855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
4858 switch (n) {
4859
4860 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004861 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 startinpos = s-starts;
4863 endinpos = startinpos+1;
4864 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
4866 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004867 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 startinpos = s-starts;
4869 endinpos = startinpos+1;
4870 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
4872 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004873 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004874 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004876 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 goto utf8Error;
4878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004880 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004881 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 break;
4883
4884 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004885 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4886 will result in surrogates in range d800-dfff. Surrogates are
4887 not valid UTF-8 so they are rejected.
4888 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4889 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004890 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004891 (s[2] & 0xc0) != 0x80 ||
4892 ((unsigned char)s[0] == 0xE0 &&
4893 (unsigned char)s[1] < 0xA0) ||
4894 ((unsigned char)s[0] == 0xED &&
4895 (unsigned char)s[1] > 0x9F)) {
4896 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004898 endinpos = startinpos + 1;
4899
4900 /* if s[1] first two bits are 1 and 0, then the invalid
4901 continuation byte is s[2], so increment endinpos by 1,
4902 if not, s[1] is invalid and endinpos doesn't need to
4903 be incremented. */
4904 if ((s[1] & 0xC0) == 0x80)
4905 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 goto utf8Error;
4907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004909 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004910 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004911 break;
4912
4913 case 4:
4914 if ((s[1] & 0xc0) != 0x80 ||
4915 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004916 (s[3] & 0xc0) != 0x80 ||
4917 ((unsigned char)s[0] == 0xF0 &&
4918 (unsigned char)s[1] < 0x90) ||
4919 ((unsigned char)s[0] == 0xF4 &&
4920 (unsigned char)s[1] > 0x8F)) {
4921 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004923 endinpos = startinpos + 1;
4924 if ((s[1] & 0xC0) == 0x80) {
4925 endinpos++;
4926 if ((s[2] & 0xC0) == 0x80)
4927 endinpos++;
4928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 goto utf8Error;
4930 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004931 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004932 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004933 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004934
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004935 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 }
4938 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004940
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 if (unicode_decode_call_errorhandler(
4943 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004944 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004946 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004948 /* Update data because unicode_decode_call_errorhandler might have
4949 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004950 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951 }
Walter Dörwald69652032004-09-07 20:24:22 +00004952 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004955 /* Adjust length and ready string when it contained errors and
4956 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004957 if (unicode_resize(&unicode, i) < 0)
4958 goto onError;
4959 unicode_adjust_maxchar(&unicode);
4960 if (unicode == NULL)
4961 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004963 Py_XDECREF(errorHandler);
4964 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004965 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004966 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004969 Py_XDECREF(errorHandler);
4970 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004971 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972 return NULL;
4973}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004974#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004975
Victor Stinner785938e2011-12-11 20:09:03 +01004976PyObject *
4977PyUnicode_DecodeUTF8Stateful(const char *s,
4978 Py_ssize_t size,
4979 const char *errors,
4980 Py_ssize_t *consumed)
4981{
4982 Py_UCS4 maxchar = 0;
4983 Py_ssize_t unicode_size;
4984 int has_errors = 0;
4985 PyObject *unicode;
4986 int kind;
4987 void *data;
4988 const char *starts = s;
4989 const char *e;
4990 Py_ssize_t i;
4991
4992 if (size == 0) {
4993 if (consumed)
4994 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004995 Py_INCREF(unicode_empty);
4996 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004997 }
4998
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004999 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01005000
5001 /* When the string is ASCII only, just use memcpy and return.
5002 unicode_size may be != size if there is an incomplete UTF-8
5003 sequence at the end of the ASCII block. */
5004 if (maxchar < 128 && size == unicode_size) {
5005 if (consumed)
5006 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01005007 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01005008 }
5009
5010 unicode = PyUnicode_New(unicode_size, maxchar);
5011 if (!unicode)
5012 return NULL;
5013 kind = PyUnicode_KIND(unicode);
5014 data = PyUnicode_DATA(unicode);
5015
5016 /* Unpack UTF-8 encoded data */
5017 i = 0;
5018 e = starts + size;
5019 switch (kind) {
5020 case PyUnicode_1BYTE_KIND:
5021 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
5022 break;
5023 case PyUnicode_2BYTE_KIND:
5024 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
5025 break;
5026 case PyUnicode_4BYTE_KIND:
5027 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
5028 break;
5029 }
5030 if (!has_errors) {
5031 /* Ensure the unicode size calculation was correct */
5032 assert(i == unicode_size);
5033 assert(s == e);
5034 if (consumed)
5035 *consumed = size;
5036 return unicode;
5037 }
5038
5039 /* In case of errors, maxchar and size computation might be incorrect;
5040 code below refits and resizes as necessary. */
5041 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5042}
5043
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc()'ed,
   L'\0'-terminated wchar_t buffer.  Every byte that is not part of a
   valid UTF-8 sequence is stored as the code unit 0xDC00 + byte.

   Returns NULL on failure.  Note that only the size-overflow path sets
   an exception (PyErr_NoMemory()); a failed PyMem_Malloc() returns NULL
   without one.  The caller owns the returned buffer (presumably to be
   released with PyMem_Free() -- confirm against the callers). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count.  The addition is done in size_t so that it cannot
       overflow a signed Py_ssize_t. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size_t)size + 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc(((size_t)size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare against the remaining length instead of forming
           `s + n`, which could point more than one element past the end
           of the buffer (undefined behaviour). */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            /* Reject overlong forms (F0 80..8F) and values above the
               Unicode range (F4 90..). */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Every jump here happens before `ch` is reassigned, so it still
           holds the raw lead byte; escape that single byte and resume
           decoding at the next one. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005152/* Primary internal function which creates utf8 encoded bytes objects.
5153
5154 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005155 and allocate exactly as much space needed at the end. Else allocate the
5156 maximum possible needed (4 result bytes per Unicode character), and return
5157 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005158*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005159PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005160_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
Victor Stinner6099a032011-12-18 14:22:26 +01005162 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005163 void *data;
5164 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005166 if (!PyUnicode_Check(unicode)) {
5167 PyErr_BadArgument();
5168 return NULL;
5169 }
5170
5171 if (PyUnicode_READY(unicode) == -1)
5172 return NULL;
5173
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005174 if (PyUnicode_UTF8(unicode))
5175 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5176 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005177
5178 kind = PyUnicode_KIND(unicode);
5179 data = PyUnicode_DATA(unicode);
5180 size = PyUnicode_GET_LENGTH(unicode);
5181
Benjamin Petersonead6b532011-12-20 17:23:42 -06005182 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005183 default:
5184 assert(0);
5185 case PyUnicode_1BYTE_KIND:
5186 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5187 assert(!PyUnicode_IS_ASCII(unicode));
5188 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5189 case PyUnicode_2BYTE_KIND:
5190 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5191 case PyUnicode_4BYTE_KIND:
5192 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194}
5195
Alexander Belopolsky40018472011-02-26 01:02:56 +00005196PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005197PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5198 Py_ssize_t size,
5199 const char *errors)
5200{
5201 PyObject *v, *unicode;
5202
5203 unicode = PyUnicode_FromUnicode(s, size);
5204 if (unicode == NULL)
5205 return NULL;
5206 v = _PyUnicode_AsUTF8String(unicode, errors);
5207 Py_DECREF(unicode);
5208 return v;
5209}
5210
5211PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005212PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005214 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215}
5216
Walter Dörwald41980ca2007-08-16 21:55:45 +00005217/* --- UTF-32 Codec ------------------------------------------------------- */
5218
5219PyObject *
5220PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 Py_ssize_t size,
5222 const char *errors,
5223 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
5225 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5226}
5227
5228PyObject *
5229PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 Py_ssize_t size,
5231 const char *errors,
5232 int *byteorder,
5233 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234{
5235 const char *starts = s;
5236 Py_ssize_t startinpos;
5237 Py_ssize_t endinpos;
5238 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005239 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005240 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 int bo = 0; /* assume native ordering by default */
5242 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243 /* Offsets from q for retrieving bytes in the right order. */
5244#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5245 int iorder[] = {0, 1, 2, 3};
5246#else
5247 int iorder[] = {3, 2, 1, 0};
5248#endif
5249 PyObject *errorHandler = NULL;
5250 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005251
Walter Dörwald41980ca2007-08-16 21:55:45 +00005252 q = (unsigned char *)s;
5253 e = q + size;
5254
5255 if (byteorder)
5256 bo = *byteorder;
5257
5258 /* Check for BOM marks (U+FEFF) in the input and adjust current
5259 byte order setting accordingly. In native mode, the leading BOM
5260 mark is skipped, in all other modes, it is copied to the output
5261 stream as-is (giving a ZWNBSP character). */
5262 if (bo == 0) {
5263 if (size >= 4) {
5264 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005266#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 if (bom == 0x0000FEFF) {
5268 q += 4;
5269 bo = -1;
5270 }
5271 else if (bom == 0xFFFE0000) {
5272 q += 4;
5273 bo = 1;
5274 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005275#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 if (bom == 0x0000FEFF) {
5277 q += 4;
5278 bo = 1;
5279 }
5280 else if (bom == 0xFFFE0000) {
5281 q += 4;
5282 bo = -1;
5283 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005284#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286 }
5287
5288 if (bo == -1) {
5289 /* force LE */
5290 iorder[0] = 0;
5291 iorder[1] = 1;
5292 iorder[2] = 2;
5293 iorder[3] = 3;
5294 }
5295 else if (bo == 1) {
5296 /* force BE */
5297 iorder[0] = 3;
5298 iorder[1] = 2;
5299 iorder[2] = 1;
5300 iorder[3] = 0;
5301 }
5302
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005303 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005304 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005305 if (!unicode)
5306 return NULL;
5307 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005308 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005309 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005310
Walter Dörwald41980ca2007-08-16 21:55:45 +00005311 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_UCS4 ch;
5313 /* remaining bytes at the end? (size should be divisible by 4) */
5314 if (e-q<4) {
5315 if (consumed)
5316 break;
5317 errmsg = "truncated data";
5318 startinpos = ((const char *)q)-starts;
5319 endinpos = ((const char *)e)-starts;
5320 goto utf32Error;
5321 /* The remaining input chars are ignored if the callback
5322 chooses to skip the input */
5323 }
5324 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5325 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005326
Benjamin Peterson29060642009-01-31 22:14:21 +00005327 if (ch >= 0x110000)
5328 {
5329 errmsg = "codepoint not in range(0x110000)";
5330 startinpos = ((const char *)q)-starts;
5331 endinpos = startinpos+4;
5332 goto utf32Error;
5333 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005334 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5335 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 q += 4;
5337 continue;
5338 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 if (unicode_decode_call_errorhandler(
5340 errors, &errorHandler,
5341 "utf32", errmsg,
5342 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005343 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005345 }
5346
5347 if (byteorder)
5348 *byteorder = bo;
5349
5350 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005352
5353 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005354 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355 goto onError;
5356
5357 Py_XDECREF(errorHandler);
5358 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005359 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362 Py_DECREF(unicode);
5363 Py_XDECREF(errorHandler);
5364 Py_XDECREF(exc);
5365 return NULL;
5366}
5367
5368PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005369_PyUnicode_EncodeUTF32(PyObject *str,
5370 const char *errors,
5371 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005372{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373 int kind;
5374 void *data;
5375 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005376 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005378 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379 /* Offsets from p for storing byte pairs in the right order. */
5380#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5381 int iorder[] = {0, 1, 2, 3};
5382#else
5383 int iorder[] = {3, 2, 1, 0};
5384#endif
5385
Benjamin Peterson29060642009-01-31 22:14:21 +00005386#define STORECHAR(CH) \
5387 do { \
5388 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5389 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5390 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5391 p[iorder[0]] = (CH) & 0xff; \
5392 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393 } while(0)
5394
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005395 if (!PyUnicode_Check(str)) {
5396 PyErr_BadArgument();
5397 return NULL;
5398 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005399 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 return NULL;
5401 kind = PyUnicode_KIND(str);
5402 data = PyUnicode_DATA(str);
5403 len = PyUnicode_GET_LENGTH(str);
5404
5405 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005406 bytesize = nsize * 4;
5407 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005409 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005410 if (v == NULL)
5411 return NULL;
5412
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005413 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005416 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005417 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005418
5419 if (byteorder == -1) {
5420 /* force LE */
5421 iorder[0] = 0;
5422 iorder[1] = 1;
5423 iorder[2] = 2;
5424 iorder[3] = 3;
5425 }
5426 else if (byteorder == 1) {
5427 /* force BE */
5428 iorder[0] = 3;
5429 iorder[1] = 2;
5430 iorder[2] = 1;
5431 iorder[3] = 0;
5432 }
5433
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005434 for (i = 0; i < len; i++)
5435 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005436
5437 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005438 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439#undef STORECHAR
5440}
5441
Alexander Belopolsky40018472011-02-26 01:02:56 +00005442PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005443PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5444 Py_ssize_t size,
5445 const char *errors,
5446 int byteorder)
5447{
5448 PyObject *result;
5449 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5450 if (tmp == NULL)
5451 return NULL;
5452 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5453 Py_DECREF(tmp);
5454 return result;
5455}
5456
5457PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005458PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005459{
Victor Stinnerb960b342011-11-20 19:12:52 +01005460 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005461}
5462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463/* --- UTF-16 Codec ------------------------------------------------------- */
5464
Tim Peters772747b2001-08-09 22:21:55 +00005465PyObject *
5466PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 Py_ssize_t size,
5468 const char *errors,
5469 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470{
Walter Dörwald69652032004-09-07 20:24:22 +00005471 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5472}
5473
/* Masks for fast processing of a whole C 'long' of UTF16-encoded input.
   FAST_CHAR_MASK and SWAPPED_FAST_CHAR_MASK check whether the 'long' may
   contain surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   STRIPPED_MASK selects the low byte of every 16-bit unit; it is used to
   byte-swap all units of a byteswapped 'long' at once.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005481#if (SIZEOF_LONG == 8)
5482# define FAST_CHAR_MASK 0x8000800080008000L
5483# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005484# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005485#elif (SIZEOF_LONG == 4)
5486# define FAST_CHAR_MASK 0x80008000L
5487# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005488# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005489#else
5490# error C 'long' size should be either 4 or 8!
5491#endif
5492
Walter Dörwald69652032004-09-07 20:24:22 +00005493PyObject *
5494PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 Py_ssize_t size,
5496 const char *errors,
5497 int *byteorder,
5498 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005499{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005500 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005501 Py_ssize_t startinpos;
5502 Py_ssize_t endinpos;
5503 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005504 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005505 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005506 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005507 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005508 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005509 /* Offsets from q for retrieving byte pairs in the right order. */
5510#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5511 int ihi = 1, ilo = 0;
5512#else
5513 int ihi = 0, ilo = 1;
5514#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 PyObject *errorHandler = NULL;
5516 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517
5518 /* Note: size will always be longer than the resulting Unicode
5519 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005520 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 if (!unicode)
5522 return NULL;
5523 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005524 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005525 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526
Tim Peters772747b2001-08-09 22:21:55 +00005527 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005528 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529
5530 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005531 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005533 /* Check for BOM marks (U+FEFF) in the input and adjust current
5534 byte order setting accordingly. In native mode, the leading BOM
5535 mark is skipped, in all other modes, it is copied to the output
5536 stream as-is (giving a ZWNBSP character). */
5537 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005538 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005539 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 if (bom == 0xFEFF) {
5542 q += 2;
5543 bo = -1;
5544 }
5545 else if (bom == 0xFFFE) {
5546 q += 2;
5547 bo = 1;
5548 }
Tim Petersced69f82003-09-16 20:30:58 +00005549#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 if (bom == 0xFEFF) {
5551 q += 2;
5552 bo = 1;
5553 }
5554 else if (bom == 0xFFFE) {
5555 q += 2;
5556 bo = -1;
5557 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005558#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Tim Peters772747b2001-08-09 22:21:55 +00005562 if (bo == -1) {
5563 /* force LE */
5564 ihi = 1;
5565 ilo = 0;
5566 }
5567 else if (bo == 1) {
5568 /* force BE */
5569 ihi = 0;
5570 ilo = 1;
5571 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005572#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5573 native_ordering = ilo < ihi;
5574#else
5575 native_ordering = ilo > ihi;
5576#endif
Tim Peters772747b2001-08-09 22:21:55 +00005577
Antoine Pitrouab868312009-01-10 15:40:25 +00005578 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005579 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005580 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005581 /* First check for possible aligned read of a C 'long'. Unaligned
5582 reads are more expensive, better to defer to another iteration. */
5583 if (!((size_t) q & LONG_PTR_MASK)) {
5584 /* Fast path for runs of non-surrogate chars. */
5585 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005586 int kind = PyUnicode_KIND(unicode);
5587 void *data = PyUnicode_DATA(unicode);
5588 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005589 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590 Py_UCS4 maxch;
5591 if (native_ordering) {
5592 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005593 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005594 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005595 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005596 else {
5597 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005598 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005599 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005600 block = ((block >> 8) & STRIPPED_MASK) |
5601 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005602 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005603 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005604#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005605 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005606 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005607 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005608 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005609 ch = (Py_UCS2)(block >> 48);
Victor Stinnere6abb482012-05-02 01:15:40 +02005610 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005611#else
5612 ch = (Py_UCS2)(block >> 16);
Victor Stinnere6abb482012-05-02 01:15:40 +02005613 maxch = MAX_MAXCHAR(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005614#endif
5615 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
Victor Stinner1b487b42012-05-03 12:29:04 +02005616 if (unicode_widen(&unicode, outpos, maxch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005617 goto onError;
5618 kind = PyUnicode_KIND(unicode);
5619 data = PyUnicode_DATA(unicode);
5620 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005621#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5622 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005623#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005624 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5625 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5626 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5627#else
5628 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5629#endif
5630#else
5631#if SIZEOF_LONG == 8
5632 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5633 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5634 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5635#else
5636 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5637#endif
5638 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005639#endif
5640 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005641 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005642 q = _q;
5643 if (q >= e)
5644 break;
5645 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647
Benjamin Peterson14339b62009-01-31 16:36:08 +00005648 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005649
Victor Stinner551ac952011-11-29 22:58:13 +01005650 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005651 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5652 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 continue;
5654 }
5655
5656 /* UTF-16 code pair: */
5657 if (q > e) {
5658 errmsg = "unexpected end of data";
5659 startinpos = (((const char *)q) - 2) - starts;
5660 endinpos = ((const char *)e) + 1 - starts;
5661 goto utf16Error;
5662 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005663 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5664 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005666 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005667 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005668 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005669 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 continue;
5671 }
5672 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005673 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 startinpos = (((const char *)q)-4)-starts;
5675 endinpos = startinpos+2;
5676 goto utf16Error;
5677 }
5678
Benjamin Peterson14339b62009-01-31 16:36:08 +00005679 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 errmsg = "illegal encoding";
5681 startinpos = (((const char *)q)-2)-starts;
5682 endinpos = startinpos+2;
5683 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005684
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005687 errors,
5688 &errorHandler,
5689 "utf16", errmsg,
5690 &starts,
5691 (const char **)&e,
5692 &startinpos,
5693 &endinpos,
5694 &exc,
5695 (const char **)&q,
5696 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005700 /* remaining byte at the end? (size should be even) */
5701 if (e == q) {
5702 if (!consumed) {
5703 errmsg = "truncated data";
5704 startinpos = ((const char *)q) - starts;
5705 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005706 if (unicode_decode_call_errorhandler(
5707 errors,
5708 &errorHandler,
5709 "utf16", errmsg,
5710 &starts,
5711 (const char **)&e,
5712 &startinpos,
5713 &endinpos,
5714 &exc,
5715 (const char **)&q,
5716 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005718 goto onError;
5719 /* The remaining input chars are ignored if the callback
5720 chooses to skip the input */
5721 }
5722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
5724 if (byteorder)
5725 *byteorder = bo;
5726
Walter Dörwald69652032004-09-07 20:24:22 +00005727 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005729
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005731 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 goto onError;
5733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 Py_XDECREF(errorHandler);
5735 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005736 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 Py_XDECREF(errorHandler);
5741 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 return NULL;
5743}
5744
Antoine Pitrouab868312009-01-10 15:40:25 +00005745#undef FAST_CHAR_MASK
5746#undef SWAPPED_FAST_CHAR_MASK
5747
Tim Peters772747b2001-08-09 22:21:55 +00005748PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005749_PyUnicode_EncodeUTF16(PyObject *str,
5750 const char *errors,
5751 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005753 int kind;
5754 void *data;
5755 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005756 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005757 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005758 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005759 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005760 /* Offsets from p for storing byte pairs in the right order. */
5761#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5762 int ihi = 1, ilo = 0;
5763#else
5764 int ihi = 0, ilo = 1;
5765#endif
5766
Benjamin Peterson29060642009-01-31 22:14:21 +00005767#define STORECHAR(CH) \
5768 do { \
5769 p[ihi] = ((CH) >> 8) & 0xff; \
5770 p[ilo] = (CH) & 0xff; \
5771 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005772 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005774 if (!PyUnicode_Check(str)) {
5775 PyErr_BadArgument();
5776 return NULL;
5777 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005778 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005779 return NULL;
5780 kind = PyUnicode_KIND(str);
5781 data = PyUnicode_DATA(str);
5782 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005783
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005784 pairs = 0;
5785 if (kind == PyUnicode_4BYTE_KIND)
5786 for (i = 0; i < len; i++)
5787 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5788 pairs++;
5789 /* 2 * (len + pairs + (byteorder == 0)) */
5790 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005792 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005793 bytesize = nsize * 2;
5794 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005796 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 if (v == NULL)
5798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005800 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005803 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005804 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005805
5806 if (byteorder == -1) {
5807 /* force LE */
5808 ihi = 1;
5809 ilo = 0;
5810 }
5811 else if (byteorder == 1) {
5812 /* force BE */
5813 ihi = 0;
5814 ilo = 1;
5815 }
5816
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005817 for (i = 0; i < len; i++) {
5818 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5819 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005821 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5822 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 }
Tim Peters772747b2001-08-09 22:21:55 +00005824 STORECHAR(ch);
5825 if (ch2)
5826 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005827 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005828
5829 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005830 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005831#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832}
5833
Alexander Belopolsky40018472011-02-26 01:02:56 +00005834PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005835PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5836 Py_ssize_t size,
5837 const char *errors,
5838 int byteorder)
5839{
5840 PyObject *result;
5841 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5842 if (tmp == NULL)
5843 return NULL;
5844 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5845 Py_DECREF(tmp);
5846 return result;
5847}
5848
5849PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005850PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005852 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853}
5854
5855/* --- Unicode Escape Codec ----------------------------------------------- */
5856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005857/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5858 if all the escapes in the string make it still a valid ASCII string.
5859 Returns -1 if any escapes were found which cause the string to
5860 pop out of ASCII range. Otherwise returns the length of the
5861 required buffer to hold the string.
5862 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005863static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005864length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5865{
5866 const unsigned char *p = (const unsigned char *)s;
5867 const unsigned char *end = p + size;
5868 Py_ssize_t length = 0;
5869
5870 if (size < 0)
5871 return -1;
5872
5873 for (; p < end; ++p) {
5874 if (*p > 127) {
5875 /* Non-ASCII */
5876 return -1;
5877 }
5878 else if (*p != '\\') {
5879 /* Normal character */
5880 ++length;
5881 }
5882 else {
5883 /* Backslash-escape, check next char */
5884 ++p;
5885 /* Escape sequence reaches till end of string or
5886 non-ASCII follow-up. */
5887 if (p >= end || *p > 127)
5888 return -1;
5889 switch (*p) {
5890 case '\n':
5891 /* backslash + \n result in zero characters */
5892 break;
5893 case '\\': case '\'': case '\"':
5894 case 'b': case 'f': case 't':
5895 case 'n': case 'r': case 'v': case 'a':
5896 ++length;
5897 break;
5898 case '0': case '1': case '2': case '3':
5899 case '4': case '5': case '6': case '7':
5900 case 'x': case 'u': case 'U': case 'N':
5901 /* these do not guarantee ASCII characters */
5902 return -1;
5903 default:
5904 /* count the backslash + the other character */
5905 length += 2;
5906 }
5907 }
5908 }
5909 return length;
5910}
5911
Fredrik Lundh06d12682001-01-24 07:59:11 +00005912static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005913
Alexander Belopolsky40018472011-02-26 01:02:56 +00005914PyObject *
5915PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005916 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005917 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005920 Py_ssize_t startinpos;
5921 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005922 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005923 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005925 char* message;
5926 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 PyObject *errorHandler = NULL;
5928 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005929 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005931
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005932 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005933
5934 /* After length_of_escaped_ascii_string() there are two alternatives,
5935 either the string is pure ASCII with named escapes like \n, etc.
5936 and we determined it's exact size (common case)
5937 or it contains \x, \u, ... escape sequences. then we create a
5938 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005939 if (len >= 0) {
5940 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 if (!v)
5942 goto onError;
5943 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005944 }
5945 else {
5946 /* Escaped strings will always be longer than the resulting
5947 Unicode string, so we start with size here and then reduce the
5948 length after conversion to the true value.
5949 (but if the error callback returns a long replacement string
5950 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005951 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005952 if (!v)
5953 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005954 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005955 }
5956
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005958 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005959 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 while (s < end) {
5963 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005964 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005967 /* The only case in which i == ascii_length is a backslash
5968 followed by a newline. */
5969 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 /* Non-escape characters are interpreted as Unicode ordinals */
5972 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005973 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5974 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 continue;
5976 }
5977
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 /* \ - Escapes */
5980 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005981 c = *s++;
5982 if (s > end)
5983 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005984
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005985 /* The only case in which i == ascii_length is a backslash
5986 followed by a newline. */
5987 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005989 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005992#define WRITECHAR(ch) \
5993 do { \
5994 if (unicode_putchar(&v, &i, ch) < 0) \
5995 goto onError; \
5996 }while(0)
5997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005999 case '\\': WRITECHAR('\\'); break;
6000 case '\'': WRITECHAR('\''); break;
6001 case '\"': WRITECHAR('\"'); break;
6002 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006003 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006004 case 'f': WRITECHAR('\014'); break;
6005 case 't': WRITECHAR('\t'); break;
6006 case 'n': WRITECHAR('\n'); break;
6007 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006008 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006009 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006010 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006011 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 case '0': case '1': case '2': case '3':
6015 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006016 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006017 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006018 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006019 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006020 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006022 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 break;
6024
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 /* hex escapes */
6026 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006028 digits = 2;
6029 message = "truncated \\xXX escape";
6030 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006034 digits = 4;
6035 message = "truncated \\uXXXX escape";
6036 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006039 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006040 digits = 8;
6041 message = "truncated \\UXXXXXXXX escape";
6042 hexescape:
6043 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 if (s+digits>end) {
6045 endinpos = size;
6046 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 errors, &errorHandler,
6048 "unicodeescape", "end of string in escape sequence",
6049 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006050 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 goto onError;
6052 goto nextByte;
6053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006054 for (j = 0; j < digits; ++j) {
6055 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006056 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006057 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 errors, &errorHandler,
6060 "unicodeescape", message,
6061 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006062 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006063 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006064 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006066 }
6067 chr = (chr<<4) & ~0xF;
6068 if (c >= '0' && c <= '9')
6069 chr += c - '0';
6070 else if (c >= 'a' && c <= 'f')
6071 chr += 10 + c - 'a';
6072 else
6073 chr += 10 + c - 'A';
6074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006075 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006076 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077 /* _decoding_error will have already written into the
6078 target buffer. */
6079 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006080 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006081 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006082 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006084 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 errors, &errorHandler,
6088 "unicodeescape", "illegal Unicode character",
6089 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006091 goto onError;
6092 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006093 break;
6094
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006096 case 'N':
6097 message = "malformed \\N character escape";
6098 if (ucnhash_CAPI == NULL) {
6099 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006100 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6101 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006102 if (ucnhash_CAPI == NULL)
6103 goto ucnhashError;
6104 }
6105 if (*s == '{') {
6106 const char *start = s+1;
6107 /* look for the closing brace */
6108 while (*s != '}' && s < end)
6109 s++;
6110 if (s > start && s < end && *s == '}') {
6111 /* found a name. look it up in the unicode database */
6112 message = "unknown Unicode character name";
6113 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006115 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006116 goto store;
6117 }
6118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 errors, &errorHandler,
6122 "unicodeescape", message,
6123 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006124 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006125 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006126 break;
6127
6128 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006129 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 message = "\\ at end of string";
6131 s--;
6132 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 errors, &errorHandler,
6135 "unicodeescape", message,
6136 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006137 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006138 goto onError;
6139 }
6140 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006141 WRITECHAR('\\');
6142 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006143 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006144 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006149#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006150
Victor Stinner16e6a802011-12-12 13:24:15 +01006151 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006152 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006153 Py_XDECREF(errorHandler);
6154 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006155 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006156
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006158 PyErr_SetString(
6159 PyExc_UnicodeError,
6160 "\\N escapes not supported (can't load unicodedata module)"
6161 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006162 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006163 Py_XDECREF(errorHandler);
6164 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006165 return NULL;
6166
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169 Py_XDECREF(errorHandler);
6170 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 return NULL;
6172}
6173
6174/* Return a Unicode-Escape string version of the Unicode object.
6175
6176 If quotes is true, the string is enclosed in u"" or u'' quotes as
6177 appropriate.
6178
6179*/
6180
Alexander Belopolsky40018472011-02-26 01:02:56 +00006181PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006185 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 int kind;
6188 void *data;
6189 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190
Thomas Wouters89f507f2006-12-13 04:49:30 +00006191 /* Initial allocation is based on the longest-possible unichr
6192 escape.
6193
6194 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6195 unichr, so in this case it's the longest unichr escape. In
6196 narrow (UTF-16) builds this is five chars per source unichr
6197 since there are two unichrs in the surrogate pair, so in narrow
6198 (UTF-16) builds it's not the longest unichr escape.
6199
6200 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6201 so in the narrow (UTF-16) build case it's the longest unichr
6202 escape.
6203 */
6204
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 if (!PyUnicode_Check(unicode)) {
6206 PyErr_BadArgument();
6207 return NULL;
6208 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006209 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210 return NULL;
6211 len = PyUnicode_GET_LENGTH(unicode);
6212 kind = PyUnicode_KIND(unicode);
6213 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006214 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6216 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6217 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6218 }
6219
6220 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006221 return PyBytes_FromStringAndSize(NULL, 0);
6222
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006223 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006225
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006226 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 if (repr == NULL)
6231 return NULL;
6232
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006233 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006235 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006236 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006237
Walter Dörwald79e913e2007-05-12 11:08:06 +00006238 /* Escape backslashes */
6239 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 *p++ = '\\';
6241 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006242 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006243 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006245 /* Map 21-bit characters to '\U00xxxxxx' */
6246 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006247 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006248 *p++ = '\\';
6249 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006250 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6251 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6257 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006259 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006260
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006262 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 *p++ = '\\';
6264 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006265 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6266 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6267 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6268 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006270
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006271 /* Map special whitespace to '\t', \n', '\r' */
6272 else if (ch == '\t') {
6273 *p++ = '\\';
6274 *p++ = 't';
6275 }
6276 else if (ch == '\n') {
6277 *p++ = '\\';
6278 *p++ = 'n';
6279 }
6280 else if (ch == '\r') {
6281 *p++ = '\\';
6282 *p++ = 'r';
6283 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006284
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006285 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006286 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006288 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006289 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6290 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006291 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006292
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 /* Copy everything else as-is */
6294 else
6295 *p++ = (char) ch;
6296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006298 assert(p - PyBytes_AS_STRING(repr) > 0);
6299 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6300 return NULL;
6301 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302}
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006305PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6306 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006308 PyObject *result;
6309 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6310 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006312 result = PyUnicode_AsUnicodeEscapeString(tmp);
6313 Py_DECREF(tmp);
6314 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315}
6316
6317/* --- Raw Unicode Escape Codec ------------------------------------------- */
6318
Alexander Belopolsky40018472011-02-26 01:02:56 +00006319PyObject *
6320PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006321 Py_ssize_t size,
6322 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006325 Py_ssize_t startinpos;
6326 Py_ssize_t endinpos;
6327 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006328 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 const char *end;
6330 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006331 PyObject *errorHandler = NULL;
6332 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 /* Escaped strings will always be longer than the resulting
6335 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 length after conversion to the true value. (But decoding error
6337 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006338 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006342 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006343 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 end = s + size;
6345 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 unsigned char c;
6347 Py_UCS4 x;
6348 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 /* Non-escape characters are interpreted as Unicode ordinals */
6352 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006353 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6354 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006356 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 startinpos = s-starts;
6358
6359 /* \u-escapes are only interpreted iff the number of leading
6360 backslashes if odd */
6361 bs = s;
6362 for (;s < end;) {
6363 if (*s != '\\')
6364 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006365 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6366 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 }
6368 if (((s - bs) & 1) == 0 ||
6369 s >= end ||
6370 (*s != 'u' && *s != 'U')) {
6371 continue;
6372 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006373 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 count = *s=='u' ? 4 : 8;
6375 s++;
6376
6377 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 for (x = 0, i = 0; i < count; ++i, ++s) {
6379 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006380 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 endinpos = s-starts;
6382 if (unicode_decode_call_errorhandler(
6383 errors, &errorHandler,
6384 "rawunicodeescape", "truncated \\uXXXX",
6385 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006386 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 goto onError;
6388 goto nextByte;
6389 }
6390 x = (x<<4) & ~0xF;
6391 if (c >= '0' && c <= '9')
6392 x += c - '0';
6393 else if (c >= 'a' && c <= 'f')
6394 x += 10 + c - 'a';
6395 else
6396 x += 10 + c - 'A';
6397 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006398 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006399 if (unicode_putchar(&v, &outpos, x) < 0)
6400 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006401 } else {
6402 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006403 if (unicode_decode_call_errorhandler(
6404 errors, &errorHandler,
6405 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006407 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006409 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 nextByte:
6411 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006413 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 Py_XDECREF(errorHandler);
6416 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006417 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006418
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 Py_XDECREF(errorHandler);
6422 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 return NULL;
6424}
6425
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006430 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 char *p;
6432 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006433 Py_ssize_t expandsize, pos;
6434 int kind;
6435 void *data;
6436 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006438 if (!PyUnicode_Check(unicode)) {
6439 PyErr_BadArgument();
6440 return NULL;
6441 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006442 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006443 return NULL;
6444 kind = PyUnicode_KIND(unicode);
6445 data = PyUnicode_DATA(unicode);
6446 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006447 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6448 bytes, and 1 byte characters 4. */
6449 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006450
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006453
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006454 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 if (repr == NULL)
6456 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006457 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006458 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006460 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006461 for (pos = 0; pos < len; pos++) {
6462 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 /* Map 32-bit characters to '\Uxxxxxxxx' */
6464 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006465 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006466 *p++ = '\\';
6467 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006468 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6475 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006476 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006478 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 *p++ = '\\';
6480 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006481 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6482 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6483 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6484 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 /* Copy everything else as-is */
6487 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 *p++ = (char) ch;
6489 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006490
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006491 assert(p > q);
6492 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006493 return NULL;
6494 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Alexander Belopolsky40018472011-02-26 01:02:56 +00006497PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006498PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6499 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006501 PyObject *result;
6502 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6503 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006504 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006505 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6506 Py_DECREF(tmp);
6507 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508}
6509
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006510/* --- Unicode Internal Codec ------------------------------------------- */
6511
Alexander Belopolsky40018472011-02-26 01:02:56 +00006512PyObject *
6513_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006514 Py_ssize_t size,
6515 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006516{
6517 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006518 Py_ssize_t startinpos;
6519 Py_ssize_t endinpos;
6520 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006521 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006522 const char *end;
6523 const char *reason;
6524 PyObject *errorHandler = NULL;
6525 PyObject *exc = NULL;
6526
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006527 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006528 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006529 1))
6530 return NULL;
6531
Thomas Wouters89f507f2006-12-13 04:49:30 +00006532 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006533 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006534 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006536 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006537 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006538 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006539 end = s + size;
6540
6541 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006542 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006543 Py_UCS4 ch;
6544 /* We copy the raw representation one byte at a time because the
6545 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 ((char *) &uch)[0] = s[0];
6547 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006548#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 ((char *) &uch)[2] = s[2];
6550 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006551#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006552 ch = uch;
6553
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006554 /* We have to sanity check the raw data, otherwise doom looms for
6555 some malformed UCS-4 data. */
6556 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006557#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006558 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006559#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006560 end-s < Py_UNICODE_SIZE
6561 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006563 startinpos = s - starts;
6564 if (end-s < Py_UNICODE_SIZE) {
6565 endinpos = end-starts;
6566 reason = "truncated input";
6567 }
6568 else {
6569 endinpos = s - starts + Py_UNICODE_SIZE;
6570 reason = "illegal code point (> 0x10FFFF)";
6571 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006572 if (unicode_decode_call_errorhandler(
6573 errors, &errorHandler,
6574 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006575 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006577 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006578 continue;
6579 }
6580
6581 s += Py_UNICODE_SIZE;
6582#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006583 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006584 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006585 Py_UNICODE uch2;
6586 ((char *) &uch2)[0] = s[0];
6587 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006588 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006589 {
Victor Stinner551ac952011-11-29 22:58:13 +01006590 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006591 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006592 }
6593 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006594#endif
6595
6596 if (unicode_putchar(&v, &outpos, ch) < 0)
6597 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006598 }
6599
Victor Stinner16e6a802011-12-12 13:24:15 +01006600 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006601 goto onError;
6602 Py_XDECREF(errorHandler);
6603 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006604 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006605
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006607 Py_XDECREF(v);
6608 Py_XDECREF(errorHandler);
6609 Py_XDECREF(exc);
6610 return NULL;
6611}
6612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613/* --- Latin-1 Codec ------------------------------------------------------ */
6614
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615PyObject *
6616PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 Py_ssize_t size,
6618 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006621 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622}
6623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006624/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625static void
6626make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006627 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006628 PyObject *unicode,
6629 Py_ssize_t startpos, Py_ssize_t endpos,
6630 const char *reason)
6631{
6632 if (*exceptionObject == NULL) {
6633 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006634 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006635 encoding, unicode, startpos, endpos, reason);
6636 }
6637 else {
6638 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6639 goto onError;
6640 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6641 goto onError;
6642 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6643 goto onError;
6644 return;
6645 onError:
6646 Py_DECREF(*exceptionObject);
6647 *exceptionObject = NULL;
6648 }
6649}
6650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652static void
6653raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006654 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006655 PyObject *unicode,
6656 Py_ssize_t startpos, Py_ssize_t endpos,
6657 const char *reason)
6658{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006659 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006660 encoding, unicode, startpos, endpos, reason);
6661 if (*exceptionObject != NULL)
6662 PyCodec_StrictErrors(*exceptionObject);
6663}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664
6665/* error handling callback helper:
6666 build arguments, call the callback and check the arguments,
6667 put the result into newpos and return the replacement string, which
6668 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006669static PyObject *
6670unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006671 PyObject **errorHandler,
6672 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006673 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006674 Py_ssize_t startpos, Py_ssize_t endpos,
6675 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006677 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 PyObject *restuple;
6680 PyObject *resunicode;
6681
6682 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 }
6687
Benjamin Petersonbac79492012-01-14 13:34:47 -05006688 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 return NULL;
6690 len = PyUnicode_GET_LENGTH(unicode);
6691
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006692 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006693 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696
6697 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006702 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 Py_DECREF(restuple);
6704 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006706 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 &resunicode, newpos)) {
6708 Py_DECREF(restuple);
6709 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006711 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6712 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6713 Py_DECREF(restuple);
6714 return NULL;
6715 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006716 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 *newpos = len + *newpos;
6718 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6720 Py_DECREF(restuple);
6721 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 Py_INCREF(resunicode);
6724 Py_DECREF(restuple);
6725 return resunicode;
6726}
6727
Alexander Belopolsky40018472011-02-26 01:02:56 +00006728static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006729unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006730 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006731 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006733 /* input state */
6734 Py_ssize_t pos=0, size;
6735 int kind;
6736 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 /* output object */
6738 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 /* pointer into the output */
6740 char *str;
6741 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006742 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006743 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6744 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745 PyObject *errorHandler = NULL;
6746 PyObject *exc = NULL;
6747 /* the following variable is used for caching string comparisons
6748 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6749 int known_errorHandler = -1;
6750
Benjamin Petersonbac79492012-01-14 13:34:47 -05006751 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 return NULL;
6753 size = PyUnicode_GET_LENGTH(unicode);
6754 kind = PyUnicode_KIND(unicode);
6755 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756 /* allocate enough for a simple encoding without
6757 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006758 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006759 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006760 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006762 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006763 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764 ressize = size;
6765
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 while (pos < size) {
6767 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 /* can we encode this? */
6770 if (c<limit) {
6771 /* no overflow check, because we know that the space is enough */
6772 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 Py_ssize_t requiredsize;
6777 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 Py_ssize_t collstart = pos;
6781 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 ++collend;
6785 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6786 if (known_errorHandler==-1) {
6787 if ((errors==NULL) || (!strcmp(errors, "strict")))
6788 known_errorHandler = 1;
6789 else if (!strcmp(errors, "replace"))
6790 known_errorHandler = 2;
6791 else if (!strcmp(errors, "ignore"))
6792 known_errorHandler = 3;
6793 else if (!strcmp(errors, "xmlcharrefreplace"))
6794 known_errorHandler = 4;
6795 else
6796 known_errorHandler = 0;
6797 }
6798 switch (known_errorHandler) {
6799 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006800 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 goto onError;
6802 case 2: /* replace */
6803 while (collstart++<collend)
6804 *str++ = '?'; /* fall through */
6805 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006806 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 break;
6808 case 4: /* xmlcharrefreplace */
6809 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006810 /* determine replacement size */
6811 for (i = collstart, repsize = 0; i < collend; ++i) {
6812 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6813 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006815 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006819 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006823 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006825 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006826 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006828 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006830 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 if (requiredsize > ressize) {
6832 if (requiredsize<2*ressize)
6833 requiredsize = 2*ressize;
6834 if (_PyBytes_Resize(&res, requiredsize))
6835 goto onError;
6836 str = PyBytes_AS_STRING(res) + respos;
6837 ressize = requiredsize;
6838 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 /* generate replacement */
6840 for (i = collstart; i < collend; ++i) {
6841 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006843 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 break;
6845 default:
6846 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006847 encoding, reason, unicode, &exc,
6848 collstart, collend, &newpos);
6849 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006850 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006852 if (PyBytes_Check(repunicode)) {
6853 /* Directly copy bytes result to output. */
6854 repsize = PyBytes_Size(repunicode);
6855 if (repsize > 1) {
6856 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006857 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006858 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6859 Py_DECREF(repunicode);
6860 goto onError;
6861 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006862 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006863 ressize += repsize-1;
6864 }
6865 memcpy(str, PyBytes_AsString(repunicode), repsize);
6866 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006867 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006868 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006869 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006870 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 /* need more space? (at least enough for what we
6872 have+the replacement+the rest of the string, so
6873 we won't have to check space for encodable characters) */
6874 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 repsize = PyUnicode_GET_LENGTH(repunicode);
6876 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 if (requiredsize > ressize) {
6878 if (requiredsize<2*ressize)
6879 requiredsize = 2*ressize;
6880 if (_PyBytes_Resize(&res, requiredsize)) {
6881 Py_DECREF(repunicode);
6882 goto onError;
6883 }
6884 str = PyBytes_AS_STRING(res) + respos;
6885 ressize = requiredsize;
6886 }
6887 /* check if there is anything unencodable in the replacement
6888 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 for (i = 0; repsize-->0; ++i, ++str) {
6890 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006892 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006893 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 Py_DECREF(repunicode);
6895 goto onError;
6896 }
6897 *str = (char)c;
6898 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006899 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006900 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006902 }
6903 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006904 /* Resize if we allocated to much */
6905 size = str - PyBytes_AS_STRING(res);
6906 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006907 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006908 if (_PyBytes_Resize(&res, size) < 0)
6909 goto onError;
6910 }
6911
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 Py_XDECREF(errorHandler);
6913 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006914 return res;
6915
6916 onError:
6917 Py_XDECREF(res);
6918 Py_XDECREF(errorHandler);
6919 Py_XDECREF(exc);
6920 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921}
6922
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006923/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924PyObject *
6925PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006926 Py_ssize_t size,
6927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006929 PyObject *result;
6930 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6931 if (unicode == NULL)
6932 return NULL;
6933 result = unicode_encode_ucs1(unicode, errors, 256);
6934 Py_DECREF(unicode);
6935 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Alexander Belopolsky40018472011-02-26 01:02:56 +00006938PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006939_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
6941 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 PyErr_BadArgument();
6943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006945 if (PyUnicode_READY(unicode) == -1)
6946 return NULL;
6947 /* Fast path: if it is a one-byte string, construct
6948 bytes object directly. */
6949 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6950 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6951 PyUnicode_GET_LENGTH(unicode));
6952 /* Non-Latin-1 characters present. Defer to above function to
6953 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006954 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006955}
6956
6957PyObject*
6958PyUnicode_AsLatin1String(PyObject *unicode)
6959{
6960 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961}
6962
6963/* --- 7-bit ASCII Codec -------------------------------------------------- */
6964
Alexander Belopolsky40018472011-02-26 01:02:56 +00006965PyObject *
6966PyUnicode_DecodeASCII(const char *s,
6967 Py_ssize_t size,
6968 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006971 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006972 int kind;
6973 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006974 Py_ssize_t startinpos;
6975 Py_ssize_t endinpos;
6976 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006978 int has_error;
6979 const unsigned char *p = (const unsigned char *)s;
6980 const unsigned char *end = p + size;
6981 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982 PyObject *errorHandler = NULL;
6983 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006984
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006985 if (size == 0) {
6986 Py_INCREF(unicode_empty);
6987 return unicode_empty;
6988 }
6989
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006991 if (size == 1 && (unsigned char)s[0] < 128)
6992 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006993
Victor Stinner702c7342011-10-05 13:50:52 +02006994 has_error = 0;
6995 while (p < end && !has_error) {
6996 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6997 an explanation. */
6998 if (!((size_t) p & LONG_PTR_MASK)) {
6999 /* Help register allocation */
7000 register const unsigned char *_p = p;
7001 while (_p < aligned_end) {
7002 unsigned long value = *(unsigned long *) _p;
7003 if (value & ASCII_CHAR_MASK) {
7004 has_error = 1;
7005 break;
7006 }
7007 _p += SIZEOF_LONG;
7008 }
7009 if (_p == end)
7010 break;
7011 if (has_error)
7012 break;
7013 p = _p;
7014 }
7015 if (*p & 0x80) {
7016 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007017 break;
Victor Stinner702c7342011-10-05 13:50:52 +02007018 }
7019 else {
7020 ++p;
7021 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007022 }
Victor Stinner702c7342011-10-05 13:50:52 +02007023 if (!has_error)
7024 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00007025
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007026 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007029 kind = PyUnicode_KIND(v);
7030 data = PyUnicode_DATA(v);
7031 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007032 e = s + size;
7033 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 register unsigned char c = (unsigned char)*s;
7035 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007036 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 ++s;
7038 }
7039 else {
7040 startinpos = s-starts;
7041 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 if (unicode_decode_call_errorhandler(
7043 errors, &errorHandler,
7044 "ascii", "ordinal not in range(128)",
7045 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007046 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007048 kind = PyUnicode_KIND(v);
7049 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007052 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007053 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 Py_XDECREF(errorHandler);
7055 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007056 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007057 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007058
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 Py_XDECREF(errorHandler);
7062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 return NULL;
7064}
7065
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007066/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007067PyObject *
7068PyUnicode_EncodeASCII(const Py_UNICODE *p,
7069 Py_ssize_t size,
7070 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007072 PyObject *result;
7073 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7074 if (unicode == NULL)
7075 return NULL;
7076 result = unicode_encode_ucs1(unicode, errors, 128);
7077 Py_DECREF(unicode);
7078 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079}
7080
Alexander Belopolsky40018472011-02-26 01:02:56 +00007081PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007082_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083{
7084 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 PyErr_BadArgument();
7086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007088 if (PyUnicode_READY(unicode) == -1)
7089 return NULL;
7090 /* Fast path: if it is an ASCII-only string, construct bytes object
7091 directly. Else defer to above function to raise the exception. */
7092 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7093 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7094 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007095 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007096}
7097
7098PyObject *
7099PyUnicode_AsASCIIString(PyObject *unicode)
7100{
7101 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102}
7103
Victor Stinner99b95382011-07-04 14:23:54 +02007104#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007105
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007106/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007107
/* The code page codecs below hand `int` lengths to the Win32 conversion
   functions.  When size_t is wider than int, NEED_RETRY enables the code
   that processes the input in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback for headers that do not provide WC_ERR_INVALID_CHARS. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7115
7116static char*
7117code_page_name(UINT code_page, PyObject **obj)
7118{
7119 *obj = NULL;
7120 if (code_page == CP_ACP)
7121 return "mbcs";
7122 if (code_page == CP_UTF7)
7123 return "CP_UTF7";
7124 if (code_page == CP_UTF8)
7125 return "CP_UTF8";
7126
7127 *obj = PyBytes_FromFormat("cp%u", code_page);
7128 if (*obj == NULL)
7129 return NULL;
7130 return PyBytes_AS_STRING(*obj);
7131}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132
Alexander Belopolsky40018472011-02-26 01:02:56 +00007133static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007134is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007135{
7136 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 if (!IsDBCSLeadByteEx(code_page, *curr))
7140 return 0;
7141
7142 prev = CharPrevExA(code_page, s, curr, 0);
7143 if (prev == curr)
7144 return 1;
7145 /* FIXME: This code is limited to "true" double-byte encodings,
7146 as it assumes an incomplete character consists of a single
7147 byte. */
7148 if (curr - prev == 2)
7149 return 1;
7150 if (!IsDBCSLeadByteEx(code_page, *prev))
7151 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152 return 0;
7153}
7154
Victor Stinner3a50e702011-10-18 21:21:00 +02007155static DWORD
7156decode_code_page_flags(UINT code_page)
7157{
7158 if (code_page == CP_UTF7) {
7159 /* The CP_UTF7 decoder only supports flags=0 */
7160 return 0;
7161 }
7162 else
7163 return MB_ERR_INVALID_CHARS;
7164}
7165
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 * Decode a byte string from a Windows code page into unicode object in strict
7168 * mode.
7169 *
7170 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7171 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007173static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007174decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007175 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 const char *in,
7177 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007178{
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007180 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182
7183 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 assert(insize > 0);
7185 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7186 if (outsize <= 0)
7187 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188
7189 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007191 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007192 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 if (*v == NULL)
7194 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007196 }
7197 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007200 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203 }
7204
7205 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7207 if (outsize <= 0)
7208 goto error;
7209 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007210
Victor Stinner3a50e702011-10-18 21:21:00 +02007211error:
7212 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7213 return -2;
7214 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007215 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007216}
7217
Victor Stinner3a50e702011-10-18 21:21:00 +02007218/*
7219 * Decode a byte string from a code page into unicode object with an error
7220 * handler.
7221 *
7222 * Returns consumed size if succeed, or raise a WindowsError or
7223 * UnicodeDecodeError exception and returns -1 on error.
7224 */
7225static int
7226decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007227 PyObject **v,
7228 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 const char *errors)
7230{
7231 const char *startin = in;
7232 const char *endin = in + size;
7233 const DWORD flags = decode_code_page_flags(code_page);
7234 /* Ideally, we should get reason from FormatMessage. This is the Windows
7235 2000 English version of the message. */
7236 const char *reason = "No mapping for the Unicode character exists "
7237 "in the target code page.";
7238 /* each step cannot decode more than 1 character, but a character can be
7239 represented as a surrogate pair */
7240 wchar_t buffer[2], *startout, *out;
7241 int insize, outsize;
7242 PyObject *errorHandler = NULL;
7243 PyObject *exc = NULL;
7244 PyObject *encoding_obj = NULL;
7245 char *encoding;
7246 DWORD err;
7247 int ret = -1;
7248
7249 assert(size > 0);
7250
7251 encoding = code_page_name(code_page, &encoding_obj);
7252 if (encoding == NULL)
7253 return -1;
7254
7255 if (errors == NULL || strcmp(errors, "strict") == 0) {
7256 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7257 UnicodeDecodeError. */
7258 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7259 if (exc != NULL) {
7260 PyCodec_StrictErrors(exc);
7261 Py_CLEAR(exc);
7262 }
7263 goto error;
7264 }
7265
7266 if (*v == NULL) {
7267 /* Create unicode object */
7268 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7269 PyErr_NoMemory();
7270 goto error;
7271 }
Victor Stinnerab595942011-12-17 04:59:06 +01007272 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 if (*v == NULL)
7275 goto error;
7276 startout = PyUnicode_AS_UNICODE(*v);
7277 }
7278 else {
7279 /* Extend unicode object */
7280 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7281 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7282 PyErr_NoMemory();
7283 goto error;
7284 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007285 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 goto error;
7287 startout = PyUnicode_AS_UNICODE(*v) + n;
7288 }
7289
7290 /* Decode the byte string character per character */
7291 out = startout;
7292 while (in < endin)
7293 {
7294 /* Decode a character */
7295 insize = 1;
7296 do
7297 {
7298 outsize = MultiByteToWideChar(code_page, flags,
7299 in, insize,
7300 buffer, Py_ARRAY_LENGTH(buffer));
7301 if (outsize > 0)
7302 break;
7303 err = GetLastError();
7304 if (err != ERROR_NO_UNICODE_TRANSLATION
7305 && err != ERROR_INSUFFICIENT_BUFFER)
7306 {
7307 PyErr_SetFromWindowsErr(0);
7308 goto error;
7309 }
7310 insize++;
7311 }
7312 /* 4=maximum length of a UTF-8 sequence */
7313 while (insize <= 4 && (in + insize) <= endin);
7314
7315 if (outsize <= 0) {
7316 Py_ssize_t startinpos, endinpos, outpos;
7317
7318 startinpos = in - startin;
7319 endinpos = startinpos + 1;
7320 outpos = out - PyUnicode_AS_UNICODE(*v);
7321 if (unicode_decode_call_errorhandler(
7322 errors, &errorHandler,
7323 encoding, reason,
7324 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007325 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 {
7327 goto error;
7328 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007329 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 }
7331 else {
7332 in += insize;
7333 memcpy(out, buffer, outsize * sizeof(wchar_t));
7334 out += outsize;
7335 }
7336 }
7337
7338 /* write a NUL character at the end */
7339 *out = 0;
7340
7341 /* Extend unicode object */
7342 outsize = out - startout;
7343 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007344 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007346 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007347
7348error:
7349 Py_XDECREF(encoding_obj);
7350 Py_XDECREF(errorHandler);
7351 Py_XDECREF(exc);
7352 return ret;
7353}
7354
Victor Stinner3a50e702011-10-18 21:21:00 +02007355static PyObject *
7356decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007357 const char *s, Py_ssize_t size,
7358 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359{
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 PyObject *v = NULL;
7361 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 if (code_page < 0) {
7364 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7365 return NULL;
7366 }
7367
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007370
Victor Stinner76a31a62011-11-04 00:05:13 +01007371 do
7372 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007374 if (size > INT_MAX) {
7375 chunk_size = INT_MAX;
7376 final = 0;
7377 done = 0;
7378 }
7379 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007381 {
7382 chunk_size = (int)size;
7383 final = (consumed == NULL);
7384 done = 1;
7385 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386
Victor Stinner76a31a62011-11-04 00:05:13 +01007387 /* Skip trailing lead-byte unless 'final' is set */
7388 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7389 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007390
Victor Stinner76a31a62011-11-04 00:05:13 +01007391 if (chunk_size == 0 && done) {
7392 if (v != NULL)
7393 break;
7394 Py_INCREF(unicode_empty);
7395 return unicode_empty;
7396 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007397
Victor Stinner76a31a62011-11-04 00:05:13 +01007398
7399 converted = decode_code_page_strict(code_page, &v,
7400 s, chunk_size);
7401 if (converted == -2)
7402 converted = decode_code_page_errors(code_page, &v,
7403 s, chunk_size,
7404 errors);
7405 assert(converted != 0);
7406
7407 if (converted < 0) {
7408 Py_XDECREF(v);
7409 return NULL;
7410 }
7411
7412 if (consumed)
7413 *consumed += converted;
7414
7415 s += converted;
7416 size -= converted;
7417 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007418
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007419 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007420}
7421
Alexander Belopolsky40018472011-02-26 01:02:56 +00007422PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007423PyUnicode_DecodeCodePageStateful(int code_page,
7424 const char *s,
7425 Py_ssize_t size,
7426 const char *errors,
7427 Py_ssize_t *consumed)
7428{
7429 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7430}
7431
7432PyObject *
7433PyUnicode_DecodeMBCSStateful(const char *s,
7434 Py_ssize_t size,
7435 const char *errors,
7436 Py_ssize_t *consumed)
7437{
7438 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7439}
7440
7441PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007442PyUnicode_DecodeMBCS(const char *s,
7443 Py_ssize_t size,
7444 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007445{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7447}
7448
Victor Stinner3a50e702011-10-18 21:21:00 +02007449static DWORD
7450encode_code_page_flags(UINT code_page, const char *errors)
7451{
7452 if (code_page == CP_UTF8) {
7453 if (winver.dwMajorVersion >= 6)
7454 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7455 and later */
7456 return WC_ERR_INVALID_CHARS;
7457 else
7458 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7459 return 0;
7460 }
7461 else if (code_page == CP_UTF7) {
7462 /* CP_UTF7 only supports flags=0 */
7463 return 0;
7464 }
7465 else {
7466 if (errors != NULL && strcmp(errors, "replace") == 0)
7467 return 0;
7468 else
7469 return WC_NO_BEST_FIT_CHARS;
7470 }
7471}
7472
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 * Encode a Unicode string to a Windows code page into a byte string in strict
7475 * mode.
7476 *
7477 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7478 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007480static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007481encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484{
Victor Stinner554f3f02010-06-16 23:33:54 +00007485 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007486 BOOL *pusedDefaultChar = &usedDefaultChar;
7487 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007488 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007489 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 const DWORD flags = encode_code_page_flags(code_page, NULL);
7492 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 /* Create a substring so that we can get the UTF-16 representation
7494 of just the slice under consideration. */
7495 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496
Martin v. Löwis3d325192011-11-04 18:23:06 +01007497 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007498
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007500 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007502 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007503
Victor Stinner2fc507f2011-11-04 20:06:39 +01007504 substring = PyUnicode_Substring(unicode, offset, offset+len);
7505 if (substring == NULL)
7506 return -1;
7507 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7508 if (p == NULL) {
7509 Py_DECREF(substring);
7510 return -1;
7511 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007512
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007513 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 outsize = WideCharToMultiByte(code_page, flags,
7515 p, size,
7516 NULL, 0,
7517 NULL, pusedDefaultChar);
7518 if (outsize <= 0)
7519 goto error;
7520 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007521 if (pusedDefaultChar && *pusedDefaultChar) {
7522 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007529 if (*outbytes == NULL) {
7530 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007532 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007534 }
7535 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 const Py_ssize_t n = PyBytes_Size(*outbytes);
7538 if (outsize > PY_SSIZE_T_MAX - n) {
7539 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007540 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007543 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7544 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007546 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007548 }
7549
7550 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 outsize = WideCharToMultiByte(code_page, flags,
7552 p, size,
7553 out, outsize,
7554 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007555 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 if (outsize <= 0)
7557 goto error;
7558 if (pusedDefaultChar && *pusedDefaultChar)
7559 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007560 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007561
Victor Stinner3a50e702011-10-18 21:21:00 +02007562error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007563 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7565 return -2;
7566 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007567 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007568}
7569
Victor Stinner3a50e702011-10-18 21:21:00 +02007570/*
7571 * Encode a Unicode string to a Windows code page into a byte string using a
7572 * error handler.
7573 *
7574 * Returns consumed characters if succeed, or raise a WindowsError and returns
7575 * -1 on other error.
7576 */
7577static int
7578encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007579 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007580 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007581{
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007583 Py_ssize_t pos = unicode_offset;
7584 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 /* Ideally, we should get reason from FormatMessage. This is the Windows
7586 2000 English version of the message. */
7587 const char *reason = "invalid character";
7588 /* 4=maximum length of a UTF-8 sequence */
7589 char buffer[4];
7590 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7591 Py_ssize_t outsize;
7592 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 PyObject *errorHandler = NULL;
7594 PyObject *exc = NULL;
7595 PyObject *encoding_obj = NULL;
7596 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007597 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 PyObject *rep;
7599 int ret = -1;
7600
7601 assert(insize > 0);
7602
7603 encoding = code_page_name(code_page, &encoding_obj);
7604 if (encoding == NULL)
7605 return -1;
7606
7607 if (errors == NULL || strcmp(errors, "strict") == 0) {
7608 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7609 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007610 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 if (exc != NULL) {
7612 PyCodec_StrictErrors(exc);
7613 Py_DECREF(exc);
7614 }
7615 Py_XDECREF(encoding_obj);
7616 return -1;
7617 }
7618
7619 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7620 pusedDefaultChar = &usedDefaultChar;
7621 else
7622 pusedDefaultChar = NULL;
7623
7624 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7625 PyErr_NoMemory();
7626 goto error;
7627 }
7628 outsize = insize * Py_ARRAY_LENGTH(buffer);
7629
7630 if (*outbytes == NULL) {
7631 /* Create string object */
7632 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7633 if (*outbytes == NULL)
7634 goto error;
7635 out = PyBytes_AS_STRING(*outbytes);
7636 }
7637 else {
7638 /* Extend string object */
7639 Py_ssize_t n = PyBytes_Size(*outbytes);
7640 if (n > PY_SSIZE_T_MAX - outsize) {
7641 PyErr_NoMemory();
7642 goto error;
7643 }
7644 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7645 goto error;
7646 out = PyBytes_AS_STRING(*outbytes) + n;
7647 }
7648
7649 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007650 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007652 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7653 wchar_t chars[2];
7654 int charsize;
7655 if (ch < 0x10000) {
7656 chars[0] = (wchar_t)ch;
7657 charsize = 1;
7658 }
7659 else {
7660 ch -= 0x10000;
7661 chars[0] = 0xd800 + (ch >> 10);
7662 chars[1] = 0xdc00 + (ch & 0x3ff);
7663 charsize = 2;
7664 }
7665
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007667 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 buffer, Py_ARRAY_LENGTH(buffer),
7669 NULL, pusedDefaultChar);
7670 if (outsize > 0) {
7671 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7672 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007673 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 memcpy(out, buffer, outsize);
7675 out += outsize;
7676 continue;
7677 }
7678 }
7679 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7680 PyErr_SetFromWindowsErr(0);
7681 goto error;
7682 }
7683
Victor Stinner3a50e702011-10-18 21:21:00 +02007684 rep = unicode_encode_call_errorhandler(
7685 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007686 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007687 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007688 if (rep == NULL)
7689 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007690 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007691
7692 if (PyBytes_Check(rep)) {
7693 outsize = PyBytes_GET_SIZE(rep);
7694 if (outsize != 1) {
7695 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7696 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7697 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7698 Py_DECREF(rep);
7699 goto error;
7700 }
7701 out = PyBytes_AS_STRING(*outbytes) + offset;
7702 }
7703 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7704 out += outsize;
7705 }
7706 else {
7707 Py_ssize_t i;
7708 enum PyUnicode_Kind kind;
7709 void *data;
7710
Benjamin Petersonbac79492012-01-14 13:34:47 -05007711 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 Py_DECREF(rep);
7713 goto error;
7714 }
7715
7716 outsize = PyUnicode_GET_LENGTH(rep);
7717 if (outsize != 1) {
7718 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7719 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7720 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7721 Py_DECREF(rep);
7722 goto error;
7723 }
7724 out = PyBytes_AS_STRING(*outbytes) + offset;
7725 }
7726 kind = PyUnicode_KIND(rep);
7727 data = PyUnicode_DATA(rep);
7728 for (i=0; i < outsize; i++) {
7729 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7730 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007731 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 encoding, unicode,
7733 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 "unable to encode error handler result to ASCII");
7735 Py_DECREF(rep);
7736 goto error;
7737 }
7738 *out = (unsigned char)ch;
7739 out++;
7740 }
7741 }
7742 Py_DECREF(rep);
7743 }
7744 /* write a NUL byte */
7745 *out = 0;
7746 outsize = out - PyBytes_AS_STRING(*outbytes);
7747 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7748 if (_PyBytes_Resize(outbytes, outsize) < 0)
7749 goto error;
7750 ret = 0;
7751
7752error:
7753 Py_XDECREF(encoding_obj);
7754 Py_XDECREF(errorHandler);
7755 Py_XDECREF(exc);
7756 return ret;
7757}
7758
Victor Stinner3a50e702011-10-18 21:21:00 +02007759static PyObject *
7760encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007761 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 const char *errors)
7763{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007764 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007765 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007766 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007767 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007768
Benjamin Petersonbac79492012-01-14 13:34:47 -05007769 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007770 return NULL;
7771 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007772
Victor Stinner3a50e702011-10-18 21:21:00 +02007773 if (code_page < 0) {
7774 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7775 return NULL;
7776 }
7777
Martin v. Löwis3d325192011-11-04 18:23:06 +01007778 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007779 return PyBytes_FromStringAndSize(NULL, 0);
7780
Victor Stinner7581cef2011-11-03 22:32:33 +01007781 offset = 0;
7782 do
7783 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007784#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007785 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007786 chunks. */
7787 if (len > INT_MAX/2) {
7788 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007789 done = 0;
7790 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007791 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007792#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007793 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007794 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007795 done = 1;
7796 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007797
Victor Stinner76a31a62011-11-04 00:05:13 +01007798 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007799 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007800 errors);
7801 if (ret == -2)
7802 ret = encode_code_page_errors(code_page, &outbytes,
7803 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007804 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007805 if (ret < 0) {
7806 Py_XDECREF(outbytes);
7807 return NULL;
7808 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007809
Victor Stinner7581cef2011-11-03 22:32:33 +01007810 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007811 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007812 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007813
Victor Stinner3a50e702011-10-18 21:21:00 +02007814 return outbytes;
7815}
7816
7817PyObject *
7818PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7819 Py_ssize_t size,
7820 const char *errors)
7821{
Victor Stinner7581cef2011-11-03 22:32:33 +01007822 PyObject *unicode, *res;
7823 unicode = PyUnicode_FromUnicode(p, size);
7824 if (unicode == NULL)
7825 return NULL;
7826 res = encode_code_page(CP_ACP, unicode, errors);
7827 Py_DECREF(unicode);
7828 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007829}
7830
7831PyObject *
7832PyUnicode_EncodeCodePage(int code_page,
7833 PyObject *unicode,
7834 const char *errors)
7835{
Victor Stinner7581cef2011-11-03 22:32:33 +01007836 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007837}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007838
Alexander Belopolsky40018472011-02-26 01:02:56 +00007839PyObject *
7840PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007841{
7842 if (!PyUnicode_Check(unicode)) {
7843 PyErr_BadArgument();
7844 return NULL;
7845 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007846 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007847}
7848
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007849#undef NEED_RETRY
7850
Victor Stinner99b95382011-07-04 14:23:54 +02007851#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007852
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853/* --- Character Mapping Codec -------------------------------------------- */
7854
Alexander Belopolsky40018472011-02-26 01:02:56 +00007855PyObject *
7856PyUnicode_DecodeCharmap(const char *s,
7857 Py_ssize_t size,
7858 PyObject *mapping,
7859 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007861 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007862 Py_ssize_t startinpos;
7863 Py_ssize_t endinpos;
7864 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007866 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007867 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 PyObject *errorHandler = NULL;
7869 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007870
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 /* Default to Latin-1 */
7872 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007875 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007879 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007880 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007882 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007883 Py_ssize_t maplen;
7884 enum PyUnicode_Kind kind;
7885 void *data;
7886 Py_UCS4 x;
7887
Benjamin Petersonbac79492012-01-14 13:34:47 -05007888 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007889 return NULL;
7890
7891 maplen = PyUnicode_GET_LENGTH(mapping);
7892 data = PyUnicode_DATA(mapping);
7893 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 while (s < e) {
7895 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007898 x = PyUnicode_READ(kind, data, ch);
7899 else
7900 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007902 if (x == 0xfffe)
7903 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 startinpos = s-starts;
7906 endinpos = startinpos+1;
7907 if (unicode_decode_call_errorhandler(
7908 errors, &errorHandler,
7909 "charmap", "character maps to <undefined>",
7910 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007911 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 goto onError;
7913 }
7914 continue;
7915 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007916
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007917 if (unicode_putchar(&v, &outpos, x) < 0)
7918 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007921 }
7922 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 while (s < e) {
7924 unsigned char ch = *s;
7925 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007926
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7928 w = PyLong_FromLong((long)ch);
7929 if (w == NULL)
7930 goto onError;
7931 x = PyObject_GetItem(mapping, w);
7932 Py_DECREF(w);
7933 if (x == NULL) {
7934 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7935 /* No mapping found means: mapping is undefined. */
7936 PyErr_Clear();
7937 x = Py_None;
7938 Py_INCREF(x);
7939 } else
7940 goto onError;
7941 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007942
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 /* Apply mapping */
7944 if (PyLong_Check(x)) {
7945 long value = PyLong_AS_LONG(x);
7946 if (value < 0 || value > 65535) {
7947 PyErr_SetString(PyExc_TypeError,
7948 "character mapping must be in range(65536)");
7949 Py_DECREF(x);
7950 goto onError;
7951 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007952 if (unicode_putchar(&v, &outpos, value) < 0)
7953 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 }
7955 else if (x == Py_None) {
7956 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 startinpos = s-starts;
7958 endinpos = startinpos+1;
7959 if (unicode_decode_call_errorhandler(
7960 errors, &errorHandler,
7961 "charmap", "character maps to <undefined>",
7962 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007963 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 Py_DECREF(x);
7965 goto onError;
7966 }
7967 Py_DECREF(x);
7968 continue;
7969 }
7970 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007971 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007972
Benjamin Petersonbac79492012-01-14 13:34:47 -05007973 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007974 goto onError;
7975 targetsize = PyUnicode_GET_LENGTH(x);
7976
7977 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007979 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007980 PyUnicode_READ_CHAR(x, 0)) < 0)
7981 goto onError;
7982 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 else if (targetsize > 1) {
7984 /* 1-n mapping */
7985 if (targetsize > extrachars) {
7986 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 Py_ssize_t needed = (targetsize - extrachars) + \
7988 (targetsize << 2);
7989 extrachars += needed;
7990 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007991 if (unicode_resize(&v,
7992 PyUnicode_GET_LENGTH(v) + needed) < 0)
7993 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 Py_DECREF(x);
7995 goto onError;
7996 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007998 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007999 goto onError;
8000 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
8001 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 extrachars -= targetsize;
8003 }
8004 /* 1-0 mapping: skip the character */
8005 }
8006 else {
8007 /* wrong return value */
8008 PyErr_SetString(PyExc_TypeError,
8009 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008010 Py_DECREF(x);
8011 goto onError;
8012 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 Py_DECREF(x);
8014 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 }
Victor Stinner16e6a802011-12-12 13:24:15 +01008017 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01008018 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008019 Py_XDECREF(errorHandler);
8020 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008021 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00008022
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008024 Py_XDECREF(errorHandler);
8025 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 Py_XDECREF(v);
8027 return NULL;
8028}
8029
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030/* Charmap encoding: the lookup table */
8031
Alexander Belopolsky40018472011-02-26 01:02:56 +00008032struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 PyObject_HEAD
8034 unsigned char level1[32];
8035 int count2, count3;
8036 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037};
8038
8039static PyObject*
8040encoding_map_size(PyObject *obj, PyObject* args)
8041{
8042 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008043 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045}
8046
8047static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 PyDoc_STR("Return the size (in bytes) of this object") },
8050 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051};
8052
8053static void
8054encoding_map_dealloc(PyObject* o)
8055{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057}
8058
/* Type object for the charmap encoding trie (struct encoding_map).
   Only the name, the basic size, the deallocator, the default flags and the
   method table are filled in; every other slot is 0.  tp_itemsize is 0 even
   though instances are allocated with extra trailing bytes for the level-2
   and level-3 tables (see PyUnicode_BuildEncodingMap). */
8059static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 "EncodingMap", /*tp_name*/
8062 sizeof(struct encoding_map), /*tp_basicsize*/
8063 0, /*tp_itemsize*/
8064 /* methods */
8065 encoding_map_dealloc, /*tp_dealloc*/
8066 0, /*tp_print*/
8067 0, /*tp_getattr*/
8068 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008069 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 0, /*tp_repr*/
8071 0, /*tp_as_number*/
8072 0, /*tp_as_sequence*/
8073 0, /*tp_as_mapping*/
8074 0, /*tp_hash*/
8075 0, /*tp_call*/
8076 0, /*tp_str*/
8077 0, /*tp_getattro*/
8078 0, /*tp_setattro*/
8079 0, /*tp_as_buffer*/
8080 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8081 0, /*tp_doc*/
8082 0, /*tp_traverse*/
8083 0, /*tp_clear*/
8084 0, /*tp_richcompare*/
8085 0, /*tp_weaklistoffset*/
8086 0, /*tp_iter*/
8087 0, /*tp_iternext*/
/* exposes the "size" method */
8088 encoding_map_methods, /*tp_methods*/
8089 0, /*tp_members*/
8090 0, /*tp_getset*/
8091 0, /*tp_base*/
8092 0, /*tp_dict*/
8093 0, /*tp_descr_get*/
8094 0, /*tp_descr_set*/
8095 0, /*tp_dictoffset*/
8096 0, /*tp_init*/
8097 0, /*tp_alloc*/
8098 0, /*tp_new*/
8099 0, /*tp_free*/
8100 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101};
8102
8103PyObject*
8104PyUnicode_BuildEncodingMap(PyObject* string)
8105{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 PyObject *result;
8107 struct encoding_map *mresult;
8108 int i;
8109 int need_dict = 0;
8110 unsigned char level1[32];
8111 unsigned char level2[512];
8112 unsigned char *mlevel1, *mlevel2, *mlevel3;
8113 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 int kind;
8115 void *data;
8116 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119 PyErr_BadArgument();
8120 return NULL;
8121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 kind = PyUnicode_KIND(string);
8123 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 memset(level1, 0xFF, sizeof level1);
8125 memset(level2, 0xFF, sizeof level2);
8126
8127 /* If there isn't a one-to-one mapping of NULL to \0,
8128 or if there are non-BMP characters, we need to use
8129 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 need_dict = 1;
8132 for (i = 1; i < 256; i++) {
8133 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134 ch = PyUnicode_READ(kind, data, i);
8135 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008136 need_dict = 1;
8137 break;
8138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008140 /* unmapped character */
8141 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008142 l1 = ch >> 11;
8143 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 if (level1[l1] == 0xFF)
8145 level1[l1] = count2++;
8146 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 }
8149
8150 if (count2 >= 0xFF || count3 >= 0xFF)
8151 need_dict = 1;
8152
8153 if (need_dict) {
8154 PyObject *result = PyDict_New();
8155 PyObject *key, *value;
8156 if (!result)
8157 return NULL;
8158 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008160 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (!key || !value)
8162 goto failed1;
8163 if (PyDict_SetItem(result, key, value) == -1)
8164 goto failed1;
8165 Py_DECREF(key);
8166 Py_DECREF(value);
8167 }
8168 return result;
8169 failed1:
8170 Py_XDECREF(key);
8171 Py_XDECREF(value);
8172 Py_DECREF(result);
8173 return NULL;
8174 }
8175
8176 /* Create a three-level trie */
8177 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8178 16*count2 + 128*count3 - 1);
8179 if (!result)
8180 return PyErr_NoMemory();
8181 PyObject_Init(result, &EncodingMapType);
8182 mresult = (struct encoding_map*)result;
8183 mresult->count2 = count2;
8184 mresult->count3 = count3;
8185 mlevel1 = mresult->level1;
8186 mlevel2 = mresult->level23;
8187 mlevel3 = mresult->level23 + 16*count2;
8188 memcpy(mlevel1, level1, 32);
8189 memset(mlevel2, 0xFF, 16*count2);
8190 memset(mlevel3, 0, 128*count3);
8191 count3 = 0;
8192 for (i = 1; i < 256; i++) {
8193 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 /* unmapped character */
8196 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 o1 = PyUnicode_READ(kind, data, i)>>11;
8198 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008199 i2 = 16*mlevel1[o1] + o2;
8200 if (mlevel2[i2] == 0xFF)
8201 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203 i3 = 128*mlevel2[i2] + o3;
8204 mlevel3[i3] = i;
8205 }
8206 return result;
8207}
8208
8209static int
Victor Stinner22168992011-11-20 17:09:18 +01008210encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211{
8212 struct encoding_map *map = (struct encoding_map*)mapping;
8213 int l1 = c>>11;
8214 int l2 = (c>>7) & 0xF;
8215 int l3 = c & 0x7F;
8216 int i;
8217
Victor Stinner22168992011-11-20 17:09:18 +01008218 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 if (c == 0)
8221 return 0;
8222 /* level 1*/
8223 i = map->level1[l1];
8224 if (i == 0xFF) {
8225 return -1;
8226 }
8227 /* level 2*/
8228 i = map->level23[16*i+l2];
8229 if (i == 0xFF) {
8230 return -1;
8231 }
8232 /* level 3 */
8233 i = map->level23[16*map->count2 + 128*i + l3];
8234 if (i == 0) {
8235 return -1;
8236 }
8237 return i;
8238}
8239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240/* Lookup the character ch in the mapping. If the character
8241 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008242 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008243static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008244charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245{
Christian Heimes217cfd12007-12-02 14:31:20 +00008246 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 PyObject *x;
8248
8249 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 x = PyObject_GetItem(mapping, w);
8252 Py_DECREF(w);
8253 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255 /* No mapping found means: mapping is undefined. */
8256 PyErr_Clear();
8257 x = Py_None;
8258 Py_INCREF(x);
8259 return x;
8260 } else
8261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008263 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008265 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 long value = PyLong_AS_LONG(x);
8267 if (value < 0 || value > 255) {
8268 PyErr_SetString(PyExc_TypeError,
8269 "character mapping must be in range(256)");
8270 Py_DECREF(x);
8271 return NULL;
8272 }
8273 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008275 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 /* wrong return value */
8279 PyErr_Format(PyExc_TypeError,
8280 "character mapping must return integer, bytes or None, not %.400s",
8281 x->ob_type->tp_name);
8282 Py_DECREF(x);
8283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
8285}
8286
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008288charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008290 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8291 /* exponentially overallocate to minimize reallocations */
8292 if (requiredsize < 2*outsize)
8293 requiredsize = 2*outsize;
8294 if (_PyBytes_Resize(outobj, requiredsize))
8295 return -1;
8296 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297}
8298
/* Outcome of one charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,      /* the character was written to the output */
    enc_FAILED,       /* the mapping has no entry for the character */
    enc_EXCEPTION     /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008303 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 space is available. Return a new reference to the object that
8305 was put in the output buffer, or Py_None, if the mapping was undefined
8306 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008307 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008308static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008309charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 PyObject *rep;
8313 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008314 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315
Christian Heimes90aa7642007-12-19 02:45:37 +00008316 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 if (res == -1)
8320 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 if (outsize<requiredsize)
8322 if (charmapencode_resize(outobj, outpos, requiredsize))
8323 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008324 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 outstart[(*outpos)++] = (char)res;
8326 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008327 }
8328
8329 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 Py_DECREF(rep);
8334 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 if (PyLong_Check(rep)) {
8337 Py_ssize_t requiredsize = *outpos+1;
8338 if (outsize<requiredsize)
8339 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8340 Py_DECREF(rep);
8341 return enc_EXCEPTION;
8342 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008343 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 else {
8347 const char *repchars = PyBytes_AS_STRING(rep);
8348 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8349 Py_ssize_t requiredsize = *outpos+repsize;
8350 if (outsize<requiredsize)
8351 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8352 Py_DECREF(rep);
8353 return enc_EXCEPTION;
8354 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008355 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 memcpy(outstart + *outpos, repchars, repsize);
8357 *outpos += repsize;
8358 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008360 Py_DECREF(rep);
8361 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362}
8363
8364/* handle an error in PyUnicode_EncodeCharmap
8365 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366static int
8367charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008370 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008371 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372{
8373 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008374 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008375 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008376 enum PyUnicode_Kind kind;
8377 void *data;
8378 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380 Py_ssize_t collstartpos = *inpos;
8381 Py_ssize_t collendpos = *inpos+1;
8382 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 char *encoding = "charmap";
8384 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008386 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008387 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388
Benjamin Petersonbac79492012-01-14 13:34:47 -05008389 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 return -1;
8391 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 /* find all unencodable characters */
8393 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008394 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008395 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008396 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008397 val = encoding_map_lookup(ch, mapping);
8398 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 break;
8400 ++collendpos;
8401 continue;
8402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8405 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 if (rep==NULL)
8407 return -1;
8408 else if (rep!=Py_None) {
8409 Py_DECREF(rep);
8410 break;
8411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 }
8415 /* cache callback name lookup
8416 * (if not done yet, i.e. it's the first error) */
8417 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 if ((errors==NULL) || (!strcmp(errors, "strict")))
8419 *known_errorHandler = 1;
8420 else if (!strcmp(errors, "replace"))
8421 *known_errorHandler = 2;
8422 else if (!strcmp(errors, "ignore"))
8423 *known_errorHandler = 3;
8424 else if (!strcmp(errors, "xmlcharrefreplace"))
8425 *known_errorHandler = 4;
8426 else
8427 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008431 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 return -1;
8433 case 2: /* replace */
8434 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 x = charmapencode_output('?', mapping, res, respos);
8436 if (x==enc_EXCEPTION) {
8437 return -1;
8438 }
8439 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008440 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 return -1;
8442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 }
8444 /* fall through */
8445 case 3: /* ignore */
8446 *inpos = collendpos;
8447 break;
8448 case 4: /* xmlcharrefreplace */
8449 /* generate replacement (temporarily (mis)uses p) */
8450 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 char buffer[2+29+1+1];
8452 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008453 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 for (cp = buffer; *cp; ++cp) {
8455 x = charmapencode_output(*cp, mapping, res, respos);
8456 if (x==enc_EXCEPTION)
8457 return -1;
8458 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008459 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return -1;
8461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 }
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 *inpos = collendpos;
8465 break;
8466 default:
8467 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008468 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008470 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008472 if (PyBytes_Check(repunicode)) {
8473 /* Directly copy bytes result to output. */
8474 Py_ssize_t outsize = PyBytes_Size(*res);
8475 Py_ssize_t requiredsize;
8476 repsize = PyBytes_Size(repunicode);
8477 requiredsize = *respos + repsize;
8478 if (requiredsize > outsize)
8479 /* Make room for all additional bytes. */
8480 if (charmapencode_resize(res, respos, requiredsize)) {
8481 Py_DECREF(repunicode);
8482 return -1;
8483 }
8484 memcpy(PyBytes_AsString(*res) + *respos,
8485 PyBytes_AsString(repunicode), repsize);
8486 *respos += repsize;
8487 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008488 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008489 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008490 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008491 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008492 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008493 Py_DECREF(repunicode);
8494 return -1;
8495 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008496 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008497 data = PyUnicode_DATA(repunicode);
8498 kind = PyUnicode_KIND(repunicode);
8499 for (index = 0; index < repsize; index++) {
8500 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8501 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return -1;
8505 }
8506 else if (x==enc_FAILED) {
8507 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008508 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
8510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 }
8512 *inpos = newpos;
8513 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 }
8515 return 0;
8516}
8517
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519_PyUnicode_EncodeCharmap(PyObject *unicode,
8520 PyObject *mapping,
8521 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 /* output object */
8524 PyObject *res = NULL;
8525 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008526 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008527 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008529 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 PyObject *errorHandler = NULL;
8531 PyObject *exc = NULL;
8532 /* the following variable is used for caching string comparisons
8533 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8534 * 3=ignore, 4=xmlcharrefreplace */
8535 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536
Benjamin Petersonbac79492012-01-14 13:34:47 -05008537 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538 return NULL;
8539 size = PyUnicode_GET_LENGTH(unicode);
8540
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 /* Default to Latin-1 */
8542 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008543 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545 /* allocate enough for a simple encoding without
8546 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008547 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 if (res == NULL)
8549 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008550 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 if (x==enc_EXCEPTION) /* error */
8558 goto onError;
8559 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008560 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 &exc,
8562 &known_errorHandler, &errorHandler, errors,
8563 &res, &respos)) {
8564 goto onError;
8565 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 else
8568 /* done with this character => adjust input position */
8569 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008573 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008574 if (_PyBytes_Resize(&res, respos) < 0)
8575 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 Py_XDECREF(exc);
8578 Py_XDECREF(errorHandler);
8579 return res;
8580
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 Py_XDECREF(res);
8583 Py_XDECREF(exc);
8584 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 return NULL;
8586}
8587
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008588/* Deprecated */
8589PyObject *
8590PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8591 Py_ssize_t size,
8592 PyObject *mapping,
8593 const char *errors)
8594{
8595 PyObject *result;
8596 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8597 if (unicode == NULL)
8598 return NULL;
8599 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8600 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008601 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008602}
8603
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604PyObject *
8605PyUnicode_AsCharmapString(PyObject *unicode,
8606 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607{
8608 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 PyErr_BadArgument();
8610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008612 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613}
8614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008615/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008616static void
8617make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619 Py_ssize_t startpos, Py_ssize_t endpos,
8620 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 *exceptionObject = _PyUnicodeTranslateError_Create(
8624 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 }
8626 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8628 goto onError;
8629 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8630 goto onError;
8631 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8632 goto onError;
8633 return;
8634 onError:
8635 Py_DECREF(*exceptionObject);
8636 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 }
8638}
8639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641static void
8642raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644 Py_ssize_t startpos, Py_ssize_t endpos,
8645 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646{
8647 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651}
8652
8653/* error handling callback helper:
8654 build arguments, call the callback and check the arguments,
8655 put the result into newpos and return the replacement string, which
8656 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008657static PyObject *
8658unicode_translate_call_errorhandler(const char *errors,
8659 PyObject **errorHandler,
8660 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008662 Py_ssize_t startpos, Py_ssize_t endpos,
8663 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008665 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008667 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 PyObject *restuple;
8669 PyObject *resunicode;
8670
8671 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 }
8676
8677 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681
8682 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008687 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 Py_DECREF(restuple);
8689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 }
8691 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 &resunicode, &i_newpos)) {
8693 Py_DECREF(restuple);
8694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008696 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 else
8699 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8702 Py_DECREF(restuple);
8703 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 Py_INCREF(resunicode);
8706 Py_DECREF(restuple);
8707 return resunicode;
8708}
8709
8710/* Lookup the character ch in the mapping and put the result in result,
8711 which must be decrefed by the caller.
8712 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008713static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715{
Christian Heimes217cfd12007-12-02 14:31:20 +00008716 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 PyObject *x;
8718
8719 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 x = PyObject_GetItem(mapping, w);
8722 Py_DECREF(w);
8723 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8725 /* No mapping found means: use 1:1 mapping. */
8726 PyErr_Clear();
8727 *result = NULL;
8728 return 0;
8729 } else
8730 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 }
8732 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 *result = x;
8734 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008736 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 long value = PyLong_AS_LONG(x);
8738 long max = PyUnicode_GetMax();
8739 if (value < 0 || value > max) {
8740 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008741 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 Py_DECREF(x);
8743 return -1;
8744 }
8745 *result = x;
8746 return 0;
8747 }
8748 else if (PyUnicode_Check(x)) {
8749 *result = x;
8750 return 0;
8751 }
8752 else {
8753 /* wrong return value */
8754 PyErr_SetString(PyExc_TypeError,
8755 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008756 Py_DECREF(x);
8757 return -1;
8758 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759}
8760/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 if not reallocate and adjust various state variables.
8762 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008768 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 /* exponentially overallocate to minimize reallocations */
8770 if (requiredsize < 2 * oldsize)
8771 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8773 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 }
8777 return 0;
8778}
8779/* lookup the character, put the result in the output string and adjust
8780 various state variables. Return a new reference to the object that
8781 was put in the output buffer in *result, or Py_None, if the mapping was
8782 undefined (in which case no character was written).
8783 The called must decref result.
8784 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8787 PyObject *mapping, Py_UCS4 **output,
8788 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008789 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8792 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008797 }
8798 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008800 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008803 }
8804 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 Py_ssize_t repsize;
8806 if (PyUnicode_READY(*res) == -1)
8807 return -1;
8808 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 if (repsize==1) {
8810 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 }
8813 else if (repsize!=0) {
8814 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 Py_ssize_t requiredsize = *opos +
8816 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 Py_ssize_t i;
8819 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 for(i = 0; i < repsize; i++)
8822 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824 }
8825 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827 return 0;
8828}
8829
Alexander Belopolsky40018472011-02-26 01:02:56 +00008830PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831_PyUnicode_TranslateCharmap(PyObject *input,
8832 PyObject *mapping,
8833 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 /* input object */
8836 char *idata;
8837 Py_ssize_t size, i;
8838 int kind;
8839 /* output buffer */
8840 Py_UCS4 *output = NULL;
8841 Py_ssize_t osize;
8842 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008845 char *reason = "character maps to <undefined>";
8846 PyObject *errorHandler = NULL;
8847 PyObject *exc = NULL;
8848 /* the following variable is used for caching string comparisons
8849 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8850 * 3=ignore, 4=xmlcharrefreplace */
8851 int known_errorHandler = -1;
8852
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 PyErr_BadArgument();
8855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 if (PyUnicode_READY(input) == -1)
8859 return NULL;
8860 idata = (char*)PyUnicode_DATA(input);
8861 kind = PyUnicode_KIND(input);
8862 size = PyUnicode_GET_LENGTH(input);
8863 i = 0;
8864
8865 if (size == 0) {
8866 Py_INCREF(input);
8867 return input;
8868 }
8869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 /* allocate enough for a simple 1:1 translation without
8871 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 osize = size;
8873 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8874 opos = 0;
8875 if (output == NULL) {
8876 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 /* try to encode it */
8882 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 if (charmaptranslate_output(input, i, mapping,
8884 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 Py_XDECREF(x);
8886 goto onError;
8887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 else { /* untranslatable character */
8892 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8893 Py_ssize_t repsize;
8894 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 Py_ssize_t collstart = i;
8898 Py_ssize_t collend = i+1;
8899 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 while (collend < size) {
8903 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 goto onError;
8905 Py_XDECREF(x);
8906 if (x!=Py_None)
8907 break;
8908 ++collend;
8909 }
8910 /* cache callback name lookup
8911 * (if not done yet, i.e. it's the first error) */
8912 if (known_errorHandler==-1) {
8913 if ((errors==NULL) || (!strcmp(errors, "strict")))
8914 known_errorHandler = 1;
8915 else if (!strcmp(errors, "replace"))
8916 known_errorHandler = 2;
8917 else if (!strcmp(errors, "ignore"))
8918 known_errorHandler = 3;
8919 else if (!strcmp(errors, "xmlcharrefreplace"))
8920 known_errorHandler = 4;
8921 else
8922 known_errorHandler = 0;
8923 }
8924 switch (known_errorHandler) {
8925 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 raise_translate_exception(&exc, input, collstart,
8927 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008928 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 case 2: /* replace */
8930 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 for (coll = collstart; coll<collend; coll++)
8932 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 /* fall through */
8934 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 break;
8937 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 /* generate replacement (temporarily (mis)uses i) */
8939 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 char buffer[2+29+1+1];
8941 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8943 if (charmaptranslate_makespace(&output, &osize,
8944 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 goto onError;
8946 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 break;
8951 default:
8952 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 reason, input, &exc,
8954 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008955 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008957 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008958 Py_DECREF(repunicode);
8959 goto onError;
8960 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 repsize = PyUnicode_GET_LENGTH(repunicode);
8963 if (charmaptranslate_makespace(&output, &osize,
8964 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 Py_DECREF(repunicode);
8966 goto onError;
8967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 for (uni2 = 0; repsize-->0; ++uni2)
8969 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8970 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008972 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008973 }
8974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8976 if (!res)
8977 goto onError;
8978 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008979 Py_XDECREF(exc);
8980 Py_XDECREF(errorHandler);
8981 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 Py_XDECREF(exc);
8986 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 return NULL;
8988}
8989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990/* Deprecated. Use PyUnicode_Translate instead. */
8991PyObject *
8992PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8993 Py_ssize_t size,
8994 PyObject *mapping,
8995 const char *errors)
8996{
8997 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8998 if (!unicode)
8999 return NULL;
9000 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9001}
9002
Alexander Belopolsky40018472011-02-26 01:02:56 +00009003PyObject *
9004PyUnicode_Translate(PyObject *str,
9005 PyObject *mapping,
9006 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007{
9008 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00009009
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 str = PyUnicode_FromObject(str);
9011 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 Py_DECREF(str);
9015 return result;
Tim Petersced69f82003-09-16 20:30:58 +00009016
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 Py_XDECREF(str);
9019 return NULL;
9020}
Tim Petersced69f82003-09-16 20:30:58 +00009021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009023fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024{
9025 /* No need to call PyUnicode_READY(self) because this function is only
9026 called as a callback from fixup() which does it already. */
9027 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9028 const int kind = PyUnicode_KIND(self);
9029 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009030 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009031 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 Py_ssize_t i;
9033
9034 for (i = 0; i < len; ++i) {
9035 ch = PyUnicode_READ(kind, data, i);
9036 fixed = 0;
9037 if (ch > 127) {
9038 if (Py_UNICODE_ISSPACE(ch))
9039 fixed = ' ';
9040 else {
9041 const int decimal = Py_UNICODE_TODECIMAL(ch);
9042 if (decimal >= 0)
9043 fixed = '0' + decimal;
9044 }
9045 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009046 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02009047 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 PyUnicode_WRITE(kind, data, i, fixed);
9049 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009050 else
9051 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 }
9054
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009055 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056}
9057
9058PyObject *
9059_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9060{
9061 if (!PyUnicode_Check(unicode)) {
9062 PyErr_BadInternalCall();
9063 return NULL;
9064 }
9065 if (PyUnicode_READY(unicode) == -1)
9066 return NULL;
9067 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9068 /* If the string is already ASCII, just return the same string */
9069 Py_INCREF(unicode);
9070 return unicode;
9071 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009072 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073}
9074
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009075PyObject *
9076PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9077 Py_ssize_t length)
9078{
Victor Stinnerf0124502011-11-21 23:12:56 +01009079 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009080 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009081 Py_UCS4 maxchar;
9082 enum PyUnicode_Kind kind;
9083 void *data;
9084
Victor Stinner99d7ad02012-02-22 13:37:39 +01009085 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009086 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009087 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009088 if (ch > 127) {
9089 int decimal = Py_UNICODE_TODECIMAL(ch);
9090 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009091 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02009092 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009093 }
9094 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009095
9096 /* Copy to a new string */
9097 decimal = PyUnicode_New(length, maxchar);
9098 if (decimal == NULL)
9099 return decimal;
9100 kind = PyUnicode_KIND(decimal);
9101 data = PyUnicode_DATA(decimal);
9102 /* Iterate over code points */
9103 for (i = 0; i < length; i++) {
9104 Py_UNICODE ch = s[i];
9105 if (ch > 127) {
9106 int decimal = Py_UNICODE_TODECIMAL(ch);
9107 if (decimal >= 0)
9108 ch = '0' + decimal;
9109 }
9110 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009112 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009113}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009114/* --- Decimal Encoder ---------------------------------------------------- */
9115
Alexander Belopolsky40018472011-02-26 01:02:56 +00009116int
9117PyUnicode_EncodeDecimal(Py_UNICODE *s,
9118 Py_ssize_t length,
9119 char *output,
9120 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009121{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009122 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009123 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009124 enum PyUnicode_Kind kind;
9125 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009126
9127 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 PyErr_BadArgument();
9129 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009130 }
9131
Victor Stinner42bf7752011-11-21 22:52:58 +01009132 unicode = PyUnicode_FromUnicode(s, length);
9133 if (unicode == NULL)
9134 return -1;
9135
Benjamin Petersonbac79492012-01-14 13:34:47 -05009136 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009137 Py_DECREF(unicode);
9138 return -1;
9139 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009140 kind = PyUnicode_KIND(unicode);
9141 data = PyUnicode_DATA(unicode);
9142
Victor Stinnerb84d7232011-11-22 01:50:07 +01009143 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009144 PyObject *exc;
9145 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009147 Py_ssize_t startpos;
9148
9149 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009150
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009152 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009153 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 decimal = Py_UNICODE_TODECIMAL(ch);
9157 if (decimal >= 0) {
9158 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009159 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 continue;
9161 }
9162 if (0 < ch && ch < 256) {
9163 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009164 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 continue;
9166 }
Victor Stinner6345be92011-11-25 20:09:01 +01009167
Victor Stinner42bf7752011-11-21 22:52:58 +01009168 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009169 exc = NULL;
9170 raise_encode_exception(&exc, "decimal", unicode,
9171 startpos, startpos+1,
9172 "invalid decimal Unicode string");
9173 Py_XDECREF(exc);
9174 Py_DECREF(unicode);
9175 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009176 }
9177 /* 0-terminate the output string */
9178 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009180 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009181}
9182
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183/* --- Helpers ------------------------------------------------------------ */
9184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009186any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 Py_ssize_t start,
9188 Py_ssize_t end)
9189{
9190 int kind1, kind2, kind;
9191 void *buf1, *buf2;
9192 Py_ssize_t len1, len2, result;
9193
9194 kind1 = PyUnicode_KIND(s1);
9195 kind2 = PyUnicode_KIND(s2);
9196 kind = kind1 > kind2 ? kind1 : kind2;
9197 buf1 = PyUnicode_DATA(s1);
9198 buf2 = PyUnicode_DATA(s2);
9199 if (kind1 != kind)
9200 buf1 = _PyUnicode_AsKind(s1, kind);
9201 if (!buf1)
9202 return -2;
9203 if (kind2 != kind)
9204 buf2 = _PyUnicode_AsKind(s2, kind);
9205 if (!buf2) {
9206 if (kind1 != kind) PyMem_Free(buf1);
9207 return -2;
9208 }
9209 len1 = PyUnicode_GET_LENGTH(s1);
9210 len2 = PyUnicode_GET_LENGTH(s2);
9211
Victor Stinner794d5672011-10-10 03:21:36 +02009212 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009213 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009214 case PyUnicode_1BYTE_KIND:
9215 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9216 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9217 else
9218 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9219 break;
9220 case PyUnicode_2BYTE_KIND:
9221 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9222 break;
9223 case PyUnicode_4BYTE_KIND:
9224 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9225 break;
9226 default:
9227 assert(0); result = -2;
9228 }
9229 }
9230 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009231 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009232 case PyUnicode_1BYTE_KIND:
9233 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9234 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9235 else
9236 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9237 break;
9238 case PyUnicode_2BYTE_KIND:
9239 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9240 break;
9241 case PyUnicode_4BYTE_KIND:
9242 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9243 break;
9244 default:
9245 assert(0); result = -2;
9246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 }
9248
9249 if (kind1 != kind)
9250 PyMem_Free(buf1);
9251 if (kind2 != kind)
9252 PyMem_Free(buf2);
9253
9254 return result;
9255}
9256
9257Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009258_PyUnicode_InsertThousandsGrouping(
9259 PyObject *unicode, Py_ssize_t index,
9260 Py_ssize_t n_buffer,
9261 void *digits, Py_ssize_t n_digits,
9262 Py_ssize_t min_width,
9263 const char *grouping, PyObject *thousands_sep,
9264 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265{
Victor Stinner41a863c2012-02-24 00:37:51 +01009266 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009267 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009268 Py_ssize_t thousands_sep_len;
9269 Py_ssize_t len;
9270
9271 if (unicode != NULL) {
9272 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009273 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009274 }
9275 else {
9276 kind = PyUnicode_1BYTE_KIND;
9277 data = NULL;
9278 }
9279 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9280 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9281 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9282 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009283 if (thousands_sep_kind < kind) {
9284 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9285 if (!thousands_sep_data)
9286 return -1;
9287 }
9288 else {
9289 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9290 if (!data)
9291 return -1;
9292 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009293 }
9294
Benjamin Petersonead6b532011-12-20 17:23:42 -06009295 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009297 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009298 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009299 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009300 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009301 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009302 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009303 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009304 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009305 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009306 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009307 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009309 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009310 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009311 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009312 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009313 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009315 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009316 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009317 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009318 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009319 break;
9320 default:
9321 assert(0);
9322 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009324 if (unicode != NULL && thousands_sep_kind != kind) {
9325 if (thousands_sep_kind < kind)
9326 PyMem_Free(thousands_sep_data);
9327 else
9328 PyMem_Free(data);
9329 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 if (unicode == NULL) {
9331 *maxchar = 127;
9332 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009333 *maxchar = MAX_MAXCHAR(*maxchar,
9334 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 }
9336 }
9337 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338}
9339
9340
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` has `len` added to
   it and is then clamped to 0.  `start` is NOT clamped to `len`, so callers
   must still cope with start > end.

   `start` and `end` must be modifiable lvalues.  Arguments are evaluated
   more than once, so they must not have side effects.  The body is wrapped
   in do { ... } while (0) so the macro behaves as a single statement (safe
   under a braceless if/else); use it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009355
Alexander Belopolsky40018472011-02-26 01:02:56 +00009356Py_ssize_t
9357PyUnicode_Count(PyObject *str,
9358 PyObject *substr,
9359 Py_ssize_t start,
9360 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009362 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009363 PyObject* str_obj;
9364 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 int kind1, kind2, kind;
9366 void *buf1 = NULL, *buf2 = NULL;
9367 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009368
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009369 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009370 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009372 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009373 if (!sub_obj) {
9374 Py_DECREF(str_obj);
9375 return -1;
9376 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009377 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009378 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 Py_DECREF(str_obj);
9380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 }
Tim Petersced69f82003-09-16 20:30:58 +00009382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 kind1 = PyUnicode_KIND(str_obj);
9384 kind2 = PyUnicode_KIND(sub_obj);
9385 kind = kind1 > kind2 ? kind1 : kind2;
9386 buf1 = PyUnicode_DATA(str_obj);
9387 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009388 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 if (!buf1)
9390 goto onError;
9391 buf2 = PyUnicode_DATA(sub_obj);
9392 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009393 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 if (!buf2)
9395 goto onError;
9396 len1 = PyUnicode_GET_LENGTH(str_obj);
9397 len2 = PyUnicode_GET_LENGTH(sub_obj);
9398
9399 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009400 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009402 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9403 result = asciilib_count(
9404 ((Py_UCS1*)buf1) + start, end - start,
9405 buf2, len2, PY_SSIZE_T_MAX
9406 );
9407 else
9408 result = ucs1lib_count(
9409 ((Py_UCS1*)buf1) + start, end - start,
9410 buf2, len2, PY_SSIZE_T_MAX
9411 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 break;
9413 case PyUnicode_2BYTE_KIND:
9414 result = ucs2lib_count(
9415 ((Py_UCS2*)buf1) + start, end - start,
9416 buf2, len2, PY_SSIZE_T_MAX
9417 );
9418 break;
9419 case PyUnicode_4BYTE_KIND:
9420 result = ucs4lib_count(
9421 ((Py_UCS4*)buf1) + start, end - start,
9422 buf2, len2, PY_SSIZE_T_MAX
9423 );
9424 break;
9425 default:
9426 assert(0); result = 0;
9427 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009428
9429 Py_DECREF(sub_obj);
9430 Py_DECREF(str_obj);
9431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 if (kind1 != kind)
9433 PyMem_Free(buf1);
9434 if (kind2 != kind)
9435 PyMem_Free(buf2);
9436
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 onError:
9439 Py_DECREF(sub_obj);
9440 Py_DECREF(str_obj);
9441 if (kind1 != kind && buf1)
9442 PyMem_Free(buf1);
9443 if (kind2 != kind && buf2)
9444 PyMem_Free(buf2);
9445 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446}
9447
Alexander Belopolsky40018472011-02-26 01:02:56 +00009448Py_ssize_t
9449PyUnicode_Find(PyObject *str,
9450 PyObject *sub,
9451 Py_ssize_t start,
9452 Py_ssize_t end,
9453 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009455 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009456
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009458 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009460 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009461 if (!sub) {
9462 Py_DECREF(str);
9463 return -2;
9464 }
9465 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9466 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 Py_DECREF(str);
9468 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 }
Tim Petersced69f82003-09-16 20:30:58 +00009470
Victor Stinner794d5672011-10-10 03:21:36 +02009471 result = any_find_slice(direction,
9472 str, sub, start, end
9473 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009474
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009476 Py_DECREF(sub);
9477
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 return result;
9479}
9480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481Py_ssize_t
9482PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9483 Py_ssize_t start, Py_ssize_t end,
9484 int direction)
9485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009487 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 if (PyUnicode_READY(str) == -1)
9489 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009490 if (start < 0 || end < 0) {
9491 PyErr_SetString(PyExc_IndexError, "string index out of range");
9492 return -2;
9493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 if (end > PyUnicode_GET_LENGTH(str))
9495 end = PyUnicode_GET_LENGTH(str);
9496 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009497 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9498 kind, end-start, ch, direction);
9499 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009501 else
9502 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503}
9504
Alexander Belopolsky40018472011-02-26 01:02:56 +00009505static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009506tailmatch(PyObject *self,
9507 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009508 Py_ssize_t start,
9509 Py_ssize_t end,
9510 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 int kind_self;
9513 int kind_sub;
9514 void *data_self;
9515 void *data_sub;
9516 Py_ssize_t offset;
9517 Py_ssize_t i;
9518 Py_ssize_t end_sub;
9519
9520 if (PyUnicode_READY(self) == -1 ||
9521 PyUnicode_READY(substring) == -1)
9522 return 0;
9523
9524 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 return 1;
9526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9528 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009530 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 kind_self = PyUnicode_KIND(self);
9533 data_self = PyUnicode_DATA(self);
9534 kind_sub = PyUnicode_KIND(substring);
9535 data_sub = PyUnicode_DATA(substring);
9536 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9537
9538 if (direction > 0)
9539 offset = end;
9540 else
9541 offset = start;
9542
9543 if (PyUnicode_READ(kind_self, data_self, offset) ==
9544 PyUnicode_READ(kind_sub, data_sub, 0) &&
9545 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9546 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9547 /* If both are of the same kind, memcmp is sufficient */
9548 if (kind_self == kind_sub) {
9549 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009550 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 data_sub,
9552 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009553 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 }
9555 /* otherwise we have to compare each character by first accesing it */
9556 else {
9557 /* We do not need to compare 0 and len(substring)-1 because
9558 the if statement above ensured already that they are equal
9559 when we end up here. */
9560 // TODO: honor direction and do a forward or backwards search
9561 for (i = 1; i < end_sub; ++i) {
9562 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9563 PyUnicode_READ(kind_sub, data_sub, i))
9564 return 0;
9565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568 }
9569
9570 return 0;
9571}
9572
Alexander Belopolsky40018472011-02-26 01:02:56 +00009573Py_ssize_t
9574PyUnicode_Tailmatch(PyObject *str,
9575 PyObject *substr,
9576 Py_ssize_t start,
9577 Py_ssize_t end,
9578 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009580 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009581
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 str = PyUnicode_FromObject(str);
9583 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585 substr = PyUnicode_FromObject(substr);
9586 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 Py_DECREF(str);
9588 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589 }
Tim Petersced69f82003-09-16 20:30:58 +00009590
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009591 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593 Py_DECREF(str);
9594 Py_DECREF(substr);
9595 return result;
9596}
9597
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598/* Apply fixfct filter to the Unicode object self and return a
9599 reference to the modified object */
9600
Alexander Belopolsky40018472011-02-26 01:02:56 +00009601static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009602fixup(PyObject *self,
9603 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 PyObject *u;
9606 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009607 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009609 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009612 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 /* fix functions return the new maximum character in a string,
9615 if the kind of the resulting unicode object does not change,
9616 everything is fine. Otherwise we need to change the string kind
9617 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009618 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009619
9620 if (maxchar_new == 0) {
9621 /* no changes */;
9622 if (PyUnicode_CheckExact(self)) {
9623 Py_DECREF(u);
9624 Py_INCREF(self);
9625 return self;
9626 }
9627 else
9628 return u;
9629 }
9630
Victor Stinnere6abb482012-05-02 01:15:40 +02009631 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632
Victor Stinnereaab6042011-12-11 22:22:39 +01009633 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009635
9636 /* In case the maximum character changed, we need to
9637 convert the string to the new category. */
9638 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9639 if (v == NULL) {
9640 Py_DECREF(u);
9641 return NULL;
9642 }
9643 if (maxchar_new > maxchar_old) {
9644 /* If the maxchar increased so that the kind changed, not all
9645 characters are representable anymore and we need to fix the
9646 string again. This only happens in very few cases. */
9647 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9648 maxchar_old = fixfct(v);
9649 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 }
9651 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009652 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009654 Py_DECREF(u);
9655 assert(_PyUnicode_CheckConsistency(v, 1));
9656 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659static PyObject *
9660ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9663 char *resdata, *data = PyUnicode_DATA(self);
9664 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009665
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 res = PyUnicode_New(len, 127);
9667 if (res == NULL)
9668 return NULL;
9669 resdata = PyUnicode_DATA(res);
9670 if (lower)
9671 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 _Py_bytes_upper(resdata, data, len);
9674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675}
9676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 Py_ssize_t j;
9681 int final_sigma;
9682 Py_UCS4 c;
9683 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9686
9687 where ! is a negation and \p{xxx} is a character with property xxx.
9688 */
9689 for (j = i - 1; j >= 0; j--) {
9690 c = PyUnicode_READ(kind, data, j);
9691 if (!_PyUnicode_IsCaseIgnorable(c))
9692 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9695 if (final_sigma) {
9696 for (j = i + 1; j < length; j++) {
9697 c = PyUnicode_READ(kind, data, j);
9698 if (!_PyUnicode_IsCaseIgnorable(c))
9699 break;
9700 }
9701 final_sigma = j == length || !_PyUnicode_IsCased(c);
9702 }
9703 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704}
9705
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706static int
9707lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9708 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 /* Obscure special case. */
9711 if (c == 0x3A3) {
9712 mapped[0] = handle_capital_sigma(kind, data, length, i);
9713 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716}
9717
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718static Py_ssize_t
9719do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 Py_ssize_t i, k = 0;
9722 int n_res, j;
9723 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009724
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 c = PyUnicode_READ(kind, data, 0);
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009728 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 for (i = 1; i < length; i++) {
9732 c = PyUnicode_READ(kind, data, i);
9733 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9734 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009735 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009737 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740}
9741
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742static Py_ssize_t
9743do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (Py_UNICODE_ISUPPER(c)) {
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 }
9752 else if (Py_UNICODE_ISLOWER(c)) {
9753 n_res = _PyUnicode_ToUpperFull(c, mapped);
9754 }
9755 else {
9756 n_res = 1;
9757 mapped[0] = c;
9758 }
9759 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009760 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 res[k++] = mapped[j];
9762 }
9763 }
9764 return k;
9765}
9766
9767static Py_ssize_t
9768do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9769 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771 Py_ssize_t i, k = 0;
9772
9773 for (i = 0; i < length; i++) {
9774 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9775 int n_res, j;
9776 if (lower)
9777 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9778 else
9779 n_res = _PyUnicode_ToUpperFull(c, mapped);
9780 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009781 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 res[k++] = mapped[j];
9783 }
9784 }
9785 return k;
9786}
9787
9788static Py_ssize_t
9789do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9790{
9791 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9792}
9793
9794static Py_ssize_t
9795do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9796{
9797 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9798}
9799
Benjamin Petersone51757f2012-01-12 21:10:29 -05009800static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009801do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9802{
9803 Py_ssize_t i, k = 0;
9804
9805 for (i = 0; i < length; i++) {
9806 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9807 Py_UCS4 mapped[3];
9808 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9809 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009810 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009811 res[k++] = mapped[j];
9812 }
9813 }
9814 return k;
9815}
9816
9817static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009818do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819{
9820 Py_ssize_t i, k = 0;
9821 int previous_is_cased;
9822
9823 previous_is_cased = 0;
9824 for (i = 0; i < length; i++) {
9825 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9826 Py_UCS4 mapped[3];
9827 int n_res, j;
9828
9829 if (previous_is_cased)
9830 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9831 else
9832 n_res = _PyUnicode_ToTitleFull(c, mapped);
9833
9834 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009835 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009836 res[k++] = mapped[j];
9837 }
9838
9839 previous_is_cased = _PyUnicode_IsCased(c);
9840 }
9841 return k;
9842}
9843
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844static PyObject *
9845case_operation(PyObject *self,
9846 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9847{
9848 PyObject *res = NULL;
9849 Py_ssize_t length, newlength = 0;
9850 int kind, outkind;
9851 void *data, *outdata;
9852 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9853
Benjamin Petersoneea48462012-01-16 14:28:50 -05009854 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855
9856 kind = PyUnicode_KIND(self);
9857 data = PyUnicode_DATA(self);
9858 length = PyUnicode_GET_LENGTH(self);
9859 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9860 if (tmp == NULL)
9861 return PyErr_NoMemory();
9862 newlength = perform(kind, data, length, tmp, &maxchar);
9863 res = PyUnicode_New(newlength, maxchar);
9864 if (res == NULL)
9865 goto leave;
9866 tmpend = tmp + newlength;
9867 outdata = PyUnicode_DATA(res);
9868 outkind = PyUnicode_KIND(res);
9869 switch (outkind) {
9870 case PyUnicode_1BYTE_KIND:
9871 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9875 break;
9876 case PyUnicode_4BYTE_KIND:
9877 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9878 break;
9879 default:
9880 assert(0);
9881 break;
9882 }
9883 leave:
9884 PyMem_FREE(tmp);
9885 return res;
9886}
9887
Tim Peters8ce9f162004-08-27 01:49:32 +00009888PyObject *
9889PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009892 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009894 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009895 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9896 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009897 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009899 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009901 int use_memcpy;
9902 unsigned char *res_data = NULL, *sep_data = NULL;
9903 PyObject *last_obj;
9904 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905
Tim Peters05eba1f2004-08-27 21:32:02 +00009906 fseq = PySequence_Fast(seq, "");
9907 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009908 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009909 }
9910
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009911 /* NOTE: the following code can't call back into Python code,
9912 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009913 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009914
Tim Peters05eba1f2004-08-27 21:32:02 +00009915 seqlen = PySequence_Fast_GET_SIZE(fseq);
9916 /* If empty sequence, return u"". */
9917 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009918 Py_DECREF(fseq);
9919 Py_INCREF(unicode_empty);
9920 res = unicode_empty;
9921 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009922 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009923
Tim Peters05eba1f2004-08-27 21:32:02 +00009924 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009925 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009926 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009927 if (seqlen == 1) {
9928 if (PyUnicode_CheckExact(items[0])) {
9929 res = items[0];
9930 Py_INCREF(res);
9931 Py_DECREF(fseq);
9932 return res;
9933 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009934 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009935 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009936 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009937 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009938 /* Set up sep and seplen */
9939 if (separator == NULL) {
9940 /* fall back to a blank space separator */
9941 sep = PyUnicode_FromOrdinal(' ');
9942 if (!sep)
9943 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009944 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009945 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009946 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009947 else {
9948 if (!PyUnicode_Check(separator)) {
9949 PyErr_Format(PyExc_TypeError,
9950 "separator: expected str instance,"
9951 " %.80s found",
9952 Py_TYPE(separator)->tp_name);
9953 goto onError;
9954 }
9955 if (PyUnicode_READY(separator))
9956 goto onError;
9957 sep = separator;
9958 seplen = PyUnicode_GET_LENGTH(separator);
9959 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9960 /* inc refcount to keep this code path symmetric with the
9961 above case of a blank separator */
9962 Py_INCREF(sep);
9963 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009964 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009965 }
9966
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009967 /* There are at least two things to join, or else we have a subclass
9968 * of str in the sequence.
9969 * Do a pre-pass to figure out the total amount of space we'll
9970 * need (sz), and see whether all argument are strings.
9971 */
9972 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009973#ifdef Py_DEBUG
9974 use_memcpy = 0;
9975#else
9976 use_memcpy = 1;
9977#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009978 for (i = 0; i < seqlen; i++) {
9979 const Py_ssize_t old_sz = sz;
9980 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009981 if (!PyUnicode_Check(item)) {
9982 PyErr_Format(PyExc_TypeError,
9983 "sequence item %zd: expected str instance,"
9984 " %.80s found",
9985 i, Py_TYPE(item)->tp_name);
9986 goto onError;
9987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 if (PyUnicode_READY(item) == -1)
9989 goto onError;
9990 sz += PyUnicode_GET_LENGTH(item);
9991 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009992 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009993 if (i != 0)
9994 sz += seplen;
9995 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9996 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 goto onError;
9999 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010000 if (use_memcpy && last_obj != NULL) {
10001 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10002 use_memcpy = 0;
10003 }
10004 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010005 }
Tim Petersced69f82003-09-16 20:30:58 +000010006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010008 if (res == NULL)
10009 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010010
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010011 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010012#ifdef Py_DEBUG
10013 use_memcpy = 0;
10014#else
10015 if (use_memcpy) {
10016 res_data = PyUnicode_1BYTE_DATA(res);
10017 kind = PyUnicode_KIND(res);
10018 if (seplen != 0)
10019 sep_data = PyUnicode_1BYTE_DATA(sep);
10020 }
10021#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010023 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010024 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010026 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010027 if (use_memcpy) {
10028 Py_MEMCPY(res_data,
10029 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010030 kind * seplen);
10031 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010032 }
10033 else {
10034 copy_characters(res, res_offset, sep, 0, seplen);
10035 res_offset += seplen;
10036 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010037 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010038 itemlen = PyUnicode_GET_LENGTH(item);
10039 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 if (use_memcpy) {
10041 Py_MEMCPY(res_data,
10042 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010043 kind * itemlen);
10044 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 }
10046 else {
10047 copy_characters(res, res_offset, item, 0, itemlen);
10048 res_offset += itemlen;
10049 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010050 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010051 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010052 if (use_memcpy)
10053 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010054 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010055 else
10056 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010057
Tim Peters05eba1f2004-08-27 21:32:02 +000010058 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010060 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010064 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010066 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067 return NULL;
10068}
10069
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the buffer `data`,
 * beginning at index `start`.  `kind` selects the code unit width
 * (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
 * PyUnicode_WCHAR_KIND is rejected by an assertion.
 *
 * Every macro argument is parenthesized where it is used, so expression
 * arguments (e.g. a conditional expression for `value`) keep their meaning.
 * `value` and `length` are evaluated more than once in the 2- and 4-byte
 * cases, so callers must not pass expressions with side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10093
Victor Stinner3fe55312012-01-04 00:33:50 +010010094Py_ssize_t
10095PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10096 Py_UCS4 fill_char)
10097{
10098 Py_ssize_t maxlen;
10099 enum PyUnicode_Kind kind;
10100 void *data;
10101
10102 if (!PyUnicode_Check(unicode)) {
10103 PyErr_BadInternalCall();
10104 return -1;
10105 }
10106 if (PyUnicode_READY(unicode) == -1)
10107 return -1;
10108 if (unicode_check_modifiable(unicode))
10109 return -1;
10110
10111 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10112 PyErr_SetString(PyExc_ValueError,
10113 "fill character is bigger than "
10114 "the string maximum character");
10115 return -1;
10116 }
10117
10118 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10119 length = Py_MIN(maxlen, length);
10120 if (length <= 0)
10121 return 0;
10122
10123 kind = PyUnicode_KIND(unicode);
10124 data = PyUnicode_DATA(unicode);
10125 FILL(kind, data, fill_char, start, length);
10126 return length;
10127}
10128
Victor Stinner9310abb2011-10-05 00:59:23 +020010129static PyObject *
10130pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010131 Py_ssize_t left,
10132 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 PyObject *u;
10136 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010137 int kind;
10138 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
10140 if (left < 0)
10141 left = 0;
10142 if (right < 0)
10143 right = 0;
10144
Victor Stinnerc4b49542011-12-11 22:44:26 +010010145 if (left == 0 && right == 0)
10146 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10149 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010150 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10151 return NULL;
10152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +020010154 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010156 if (!u)
10157 return NULL;
10158
10159 kind = PyUnicode_KIND(u);
10160 data = PyUnicode_DATA(u);
10161 if (left)
10162 FILL(kind, data, fill, 0, left);
10163 if (right)
10164 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010165 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010166 assert(_PyUnicode_CheckConsistency(u, 1));
10167 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
Alexander Belopolsky40018472011-02-26 01:02:56 +000010170PyObject *
10171PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
10175 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010176 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010177 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010178 if (PyUnicode_READY(string) == -1) {
10179 Py_DECREF(string);
10180 return NULL;
10181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Benjamin Petersonead6b532011-12-20 17:23:42 -060010183 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 if (PyUnicode_IS_ASCII(string))
10186 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 PyUnicode_GET_LENGTH(string), keepends);
10189 else
10190 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 case PyUnicode_2BYTE_KIND:
10195 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 PyUnicode_GET_LENGTH(string), keepends);
10198 break;
10199 case PyUnicode_4BYTE_KIND:
10200 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 PyUnicode_GET_LENGTH(string), keepends);
10203 break;
10204 default:
10205 assert(0);
10206 list = 0;
10207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208 Py_DECREF(string);
10209 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210}
10211
Alexander Belopolsky40018472011-02-26 01:02:56 +000010212static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010213split(PyObject *self,
10214 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010215 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 int kind1, kind2, kind;
10218 void *buf1, *buf2;
10219 Py_ssize_t len1, len2;
10220 PyObject* out;
10221
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010223 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (PyUnicode_READY(self) == -1)
10226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010229 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 if (PyUnicode_IS_ASCII(self))
10232 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010233 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010234 PyUnicode_GET_LENGTH(self), maxcount
10235 );
10236 else
10237 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010239 PyUnicode_GET_LENGTH(self), maxcount
10240 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 case PyUnicode_2BYTE_KIND:
10242 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 PyUnicode_GET_LENGTH(self), maxcount
10245 );
10246 case PyUnicode_4BYTE_KIND:
10247 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010248 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 PyUnicode_GET_LENGTH(self), maxcount
10250 );
10251 default:
10252 assert(0);
10253 return NULL;
10254 }
10255
10256 if (PyUnicode_READY(substring) == -1)
10257 return NULL;
10258
10259 kind1 = PyUnicode_KIND(self);
10260 kind2 = PyUnicode_KIND(substring);
10261 kind = kind1 > kind2 ? kind1 : kind2;
10262 buf1 = PyUnicode_DATA(self);
10263 buf2 = PyUnicode_DATA(substring);
10264 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010265 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 if (!buf1)
10267 return NULL;
10268 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 if (!buf2) {
10271 if (kind1 != kind) PyMem_Free(buf1);
10272 return NULL;
10273 }
10274 len1 = PyUnicode_GET_LENGTH(self);
10275 len2 = PyUnicode_GET_LENGTH(substring);
10276
Benjamin Petersonead6b532011-12-20 17:23:42 -060010277 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010279 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10280 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010281 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010282 else
10283 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 break;
10286 case PyUnicode_2BYTE_KIND:
10287 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 break;
10290 case PyUnicode_4BYTE_KIND:
10291 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010292 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 break;
10294 default:
10295 out = NULL;
10296 }
10297 if (kind1 != kind)
10298 PyMem_Free(buf1);
10299 if (kind2 != kind)
10300 PyMem_Free(buf2);
10301 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302}
10303
Alexander Belopolsky40018472011-02-26 01:02:56 +000010304static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010305rsplit(PyObject *self,
10306 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010307 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 int kind1, kind2, kind;
10310 void *buf1, *buf2;
10311 Py_ssize_t len1, len2;
10312 PyObject* out;
10313
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010314 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010315 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (PyUnicode_READY(self) == -1)
10318 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010321 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 if (PyUnicode_IS_ASCII(self))
10324 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010325 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 PyUnicode_GET_LENGTH(self), maxcount
10327 );
10328 else
10329 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010331 PyUnicode_GET_LENGTH(self), maxcount
10332 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 case PyUnicode_2BYTE_KIND:
10334 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
10338 case PyUnicode_4BYTE_KIND:
10339 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 default:
10344 assert(0);
10345 return NULL;
10346 }
10347
10348 if (PyUnicode_READY(substring) == -1)
10349 return NULL;
10350
10351 kind1 = PyUnicode_KIND(self);
10352 kind2 = PyUnicode_KIND(substring);
10353 kind = kind1 > kind2 ? kind1 : kind2;
10354 buf1 = PyUnicode_DATA(self);
10355 buf2 = PyUnicode_DATA(substring);
10356 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010357 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 if (!buf1)
10359 return NULL;
10360 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 if (!buf2) {
10363 if (kind1 != kind) PyMem_Free(buf1);
10364 return NULL;
10365 }
10366 len1 = PyUnicode_GET_LENGTH(self);
10367 len2 = PyUnicode_GET_LENGTH(substring);
10368
Benjamin Petersonead6b532011-12-20 17:23:42 -060010369 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10372 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010373 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374 else
10375 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 break;
10378 case PyUnicode_2BYTE_KIND:
10379 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 break;
10382 case PyUnicode_4BYTE_KIND:
10383 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 default:
10387 out = NULL;
10388 }
10389 if (kind1 != kind)
10390 PyMem_Free(buf1);
10391 if (kind2 != kind)
10392 PyMem_Free(buf2);
10393 return out;
10394}
10395
10396static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10398 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010400 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10403 return asciilib_find(buf1, len1, buf2, len2, offset);
10404 else
10405 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 case PyUnicode_2BYTE_KIND:
10407 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10408 case PyUnicode_4BYTE_KIND:
10409 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10410 }
10411 assert(0);
10412 return -1;
10413}
10414
10415static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010416anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10417 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010419 switch (kind) {
10420 case PyUnicode_1BYTE_KIND:
10421 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10422 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10423 else
10424 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10425 case PyUnicode_2BYTE_KIND:
10426 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10427 case PyUnicode_4BYTE_KIND:
10428 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10429 }
10430 assert(0);
10431 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010432}
10433
Alexander Belopolsky40018472011-02-26 01:02:56 +000010434static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435replace(PyObject *self, PyObject *str1,
10436 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 PyObject *u;
10439 char *sbuf = PyUnicode_DATA(self);
10440 char *buf1 = PyUnicode_DATA(str1);
10441 char *buf2 = PyUnicode_DATA(str2);
10442 int srelease = 0, release1 = 0, release2 = 0;
10443 int skind = PyUnicode_KIND(self);
10444 int kind1 = PyUnicode_KIND(str1);
10445 int kind2 = PyUnicode_KIND(str2);
10446 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10447 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10448 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010449 int mayshrink;
10450 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
10452 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010453 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010455 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456
Victor Stinner59de0ee2011-10-07 10:01:28 +020010457 if (str1 == str2)
10458 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (skind < kind1)
10460 /* substring too wide to be present */
10461 goto nothing;
10462
Victor Stinner49a0a212011-10-12 23:46:10 +020010463 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10464 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10465 /* Replacing str1 with str2 may cause a maxchar reduction in the
10466 result string. */
10467 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010468 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010473 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010475 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010476 Py_UCS4 u1, u2;
10477 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010478 Py_ssize_t index, pos;
10479 char *src;
10480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010482 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10483 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010487 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010489 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010491
10492 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10493 index = 0;
10494 src = sbuf;
10495 while (--maxcount)
10496 {
10497 pos++;
10498 src += pos * PyUnicode_KIND(self);
10499 slen -= pos;
10500 index += pos;
10501 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10502 if (pos < 0)
10503 break;
10504 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10505 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 }
10507 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 int rkind = skind;
10509 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010510 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 if (kind1 < rkind) {
10513 /* widen substring */
10514 buf1 = _PyUnicode_AsKind(str1, rkind);
10515 if (!buf1) goto error;
10516 release1 = 1;
10517 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010518 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519 if (i < 0)
10520 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (rkind > kind2) {
10522 /* widen replacement */
10523 buf2 = _PyUnicode_AsKind(str2, rkind);
10524 if (!buf2) goto error;
10525 release2 = 1;
10526 }
10527 else if (rkind < kind2) {
10528 /* widen self and buf1 */
10529 rkind = kind2;
10530 if (release1) PyMem_Free(buf1);
10531 sbuf = _PyUnicode_AsKind(self, rkind);
10532 if (!sbuf) goto error;
10533 srelease = 1;
10534 buf1 = _PyUnicode_AsKind(str1, rkind);
10535 if (!buf1) goto error;
10536 release1 = 1;
10537 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 u = PyUnicode_New(slen, maxchar);
10539 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 assert(PyUnicode_KIND(u) == rkind);
10542 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010543
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010544 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010545 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010546 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010548 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010550
10551 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010553 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010555 if (i == -1)
10556 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010557 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 }
10564 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 Py_ssize_t n, i, j, ires;
10566 Py_ssize_t product, new_size;
10567 int rkind = skind;
10568 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 buf1 = _PyUnicode_AsKind(str1, rkind);
10573 if (!buf1) goto error;
10574 release1 = 1;
10575 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010576 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010577 if (n == 0)
10578 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 buf2 = _PyUnicode_AsKind(str2, rkind);
10582 if (!buf2) goto error;
10583 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 rkind = kind2;
10588 sbuf = _PyUnicode_AsKind(self, rkind);
10589 if (!sbuf) goto error;
10590 srelease = 1;
10591 if (release1) PyMem_Free(buf1);
10592 buf1 = _PyUnicode_AsKind(str1, rkind);
10593 if (!buf1) goto error;
10594 release1 = 1;
10595 }
10596 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10597 PyUnicode_GET_LENGTH(str1))); */
10598 product = n * (len2-len1);
10599 if ((product / (len2-len1)) != n) {
10600 PyErr_SetString(PyExc_OverflowError,
10601 "replace string is too long");
10602 goto error;
10603 }
10604 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010605 if (new_size == 0) {
10606 Py_INCREF(unicode_empty);
10607 u = unicode_empty;
10608 goto done;
10609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10611 PyErr_SetString(PyExc_OverflowError,
10612 "replace string is too long");
10613 goto error;
10614 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 u = PyUnicode_New(new_size, maxchar);
10616 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010618 assert(PyUnicode_KIND(u) == rkind);
10619 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 ires = i = 0;
10621 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 while (n-- > 0) {
10623 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010624 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010625 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010626 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010627 if (j == -1)
10628 break;
10629 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010631 memcpy(res + rkind * ires,
10632 sbuf + rkind * i,
10633 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 }
10636 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010638 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010640 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010647 memcpy(res + rkind * ires,
10648 sbuf + rkind * i,
10649 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010650 }
10651 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 /* interleave */
10653 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 if (--n <= 0)
10659 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 memcpy(res + rkind * ires,
10661 sbuf + rkind * i,
10662 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 ires++;
10664 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
10667 sbuf + rkind * i,
10668 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010669 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010670 }
10671
10672 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010673 unicode_adjust_maxchar(&u);
10674 if (u == NULL)
10675 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010677
10678 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (srelease)
10680 PyMem_FREE(sbuf);
10681 if (release1)
10682 PyMem_FREE(buf1);
10683 if (release2)
10684 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010685 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687
Benjamin Peterson29060642009-01-31 22:14:21 +000010688 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (srelease)
10691 PyMem_FREE(sbuf);
10692 if (release1)
10693 PyMem_FREE(buf1);
10694 if (release2)
10695 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010696 return unicode_result_unchanged(self);
10697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 error:
10699 if (srelease && sbuf)
10700 PyMem_FREE(sbuf);
10701 if (release1 && buf1)
10702 PyMem_FREE(buf1);
10703 if (release2 && buf2)
10704 PyMem_FREE(buf2);
10705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706}
10707
10708/* --- Unicode Object Methods --------------------------------------------- */
10709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010710PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712\n\
10713Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010714characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715
10716static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010717unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010719 if (PyUnicode_READY(self) == -1)
10720 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010721 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
10727Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010728have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010731unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010733 if (PyUnicode_READY(self) == -1)
10734 return NULL;
10735 if (PyUnicode_GET_LENGTH(self) == 0)
10736 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010737 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738}
10739
Benjamin Petersond5890c82012-01-14 13:23:30 -050010740PyDoc_STRVAR(casefold__doc__,
10741 "S.casefold() -> str\n\
10742\n\
10743Return a version of S suitable for caseless comparisons.");
10744
10745static PyObject *
10746unicode_casefold(PyObject *self)
10747{
10748 if (PyUnicode_READY(self) == -1)
10749 return NULL;
10750 if (PyUnicode_IS_ASCII(self))
10751 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010752 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010753}
10754
10755
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010756/* Argument converter. Coerces to a single unicode character */
10757
10758static int
10759convert_uc(PyObject *obj, void *addr)
10760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010762 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010763
Benjamin Peterson14339b62009-01-31 16:36:08 +000010764 uniobj = PyUnicode_FromObject(obj);
10765 if (uniobj == NULL) {
10766 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010768 return 0;
10769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010771 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010772 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010773 Py_DECREF(uniobj);
10774 return 0;
10775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010777 Py_DECREF(uniobj);
10778 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010779}
10780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010781PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010784Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010785done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
10787static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010788unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010790 Py_ssize_t marg, left;
10791 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 Py_UCS4 fillchar = ' ';
10793
Victor Stinnere9a29352011-10-01 02:14:59 +020010794 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796
Benjamin Petersonbac79492012-01-14 13:34:47 -050010797 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 return NULL;
10799
Victor Stinnerc4b49542011-12-11 22:44:26 +010010800 if (PyUnicode_GET_LENGTH(self) >= width)
10801 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802
Victor Stinnerc4b49542011-12-11 22:44:26 +010010803 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 left = marg / 2 + (marg & width & 1);
10805
Victor Stinner9310abb2011-10-05 00:59:23 +020010806 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807}
10808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809/* This function assumes that str1 and str2 are readied by the caller. */
10810
Marc-André Lemburge5034372000-08-08 08:04:29 +000010811static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010812unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 int kind1, kind2;
10815 void *data1, *data2;
10816 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 kind1 = PyUnicode_KIND(str1);
10819 kind2 = PyUnicode_KIND(str2);
10820 data1 = PyUnicode_DATA(str1);
10821 data2 = PyUnicode_DATA(str2);
10822 len1 = PyUnicode_GET_LENGTH(str1);
10823 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 for (i = 0; i < len1 && i < len2; ++i) {
10826 Py_UCS4 c1, c2;
10827 c1 = PyUnicode_READ(kind1, data1, i);
10828 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010829
10830 if (c1 != c2)
10831 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010832 }
10833
10834 return (len1 < len2) ? -1 : (len1 != len2);
10835}
10836
Alexander Belopolsky40018472011-02-26 01:02:56 +000010837int
10838PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10841 if (PyUnicode_READY(left) == -1 ||
10842 PyUnicode_READY(right) == -1)
10843 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010844 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010846 PyErr_Format(PyExc_TypeError,
10847 "Can't compare %.100s and %.100s",
10848 left->ob_type->tp_name,
10849 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 return -1;
10851}
10852
Martin v. Löwis5b222132007-06-10 09:51:05 +000010853int
10854PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 Py_ssize_t i;
10857 int kind;
10858 void *data;
10859 Py_UCS4 chr;
10860
Victor Stinner910337b2011-10-03 03:20:16 +020010861 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 if (PyUnicode_READY(uni) == -1)
10863 return -1;
10864 kind = PyUnicode_KIND(uni);
10865 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010866 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10868 if (chr != str[i])
10869 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010870 /* This check keeps Python strings that end in '\0' from comparing equal
10871 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010874 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010876 return 0;
10877}
10878
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010879
/* Map a C truth value onto the Py_True / Py_False singletons
   (the reference count is left untouched). */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010882
Alexander Belopolsky40018472011-02-26 01:02:56 +000010883PyObject *
10884PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010885{
10886 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010887
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010888 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10889 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (PyUnicode_READY(left) == -1 ||
10891 PyUnicode_READY(right) == -1)
10892 return NULL;
10893 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10894 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010895 if (op == Py_EQ) {
10896 Py_INCREF(Py_False);
10897 return Py_False;
10898 }
10899 if (op == Py_NE) {
10900 Py_INCREF(Py_True);
10901 return Py_True;
10902 }
10903 }
10904 if (left == right)
10905 result = 0;
10906 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010907 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010908
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010909 /* Convert the return value to a Boolean */
10910 switch (op) {
10911 case Py_EQ:
10912 v = TEST_COND(result == 0);
10913 break;
10914 case Py_NE:
10915 v = TEST_COND(result != 0);
10916 break;
10917 case Py_LE:
10918 v = TEST_COND(result <= 0);
10919 break;
10920 case Py_GE:
10921 v = TEST_COND(result >= 0);
10922 break;
10923 case Py_LT:
10924 v = TEST_COND(result == -1);
10925 break;
10926 case Py_GT:
10927 v = TEST_COND(result == 1);
10928 break;
10929 default:
10930 PyErr_BadArgument();
10931 return NULL;
10932 }
10933 Py_INCREF(v);
10934 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010935 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010936
Brian Curtindfc80e32011-08-10 20:28:54 -050010937 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010938}
10939
Alexander Belopolsky40018472011-02-26 01:02:56 +000010940int
10941PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010942{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 int kind1, kind2, kind;
10945 void *buf1, *buf2;
10946 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010947 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010948
10949 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010950 sub = PyUnicode_FromObject(element);
10951 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 PyErr_Format(PyExc_TypeError,
10953 "'in <string>' requires string as left operand, not %s",
10954 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010955 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010956 }
10957
Thomas Wouters477c8d52006-05-27 19:21:47 +000010958 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010959 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010960 Py_DECREF(sub);
10961 return -1;
10962 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010963 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10964 Py_DECREF(sub);
10965 Py_DECREF(str);
10966 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 kind1 = PyUnicode_KIND(str);
10969 kind2 = PyUnicode_KIND(sub);
10970 kind = kind1 > kind2 ? kind1 : kind2;
10971 buf1 = PyUnicode_DATA(str);
10972 buf2 = PyUnicode_DATA(sub);
10973 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010974 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (!buf1) {
10976 Py_DECREF(sub);
10977 return -1;
10978 }
10979 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010980 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 if (!buf2) {
10982 Py_DECREF(sub);
10983 if (kind1 != kind) PyMem_Free(buf1);
10984 return -1;
10985 }
10986 len1 = PyUnicode_GET_LENGTH(str);
10987 len2 = PyUnicode_GET_LENGTH(sub);
10988
Benjamin Petersonead6b532011-12-20 17:23:42 -060010989 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 case PyUnicode_1BYTE_KIND:
10991 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10992 break;
10993 case PyUnicode_2BYTE_KIND:
10994 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10995 break;
10996 case PyUnicode_4BYTE_KIND:
10997 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10998 break;
10999 default:
11000 result = -1;
11001 assert(0);
11002 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011003
11004 Py_DECREF(str);
11005 Py_DECREF(sub);
11006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (kind1 != kind)
11008 PyMem_Free(buf1);
11009 if (kind2 != kind)
11010 PyMem_Free(buf2);
11011
Guido van Rossum403d68b2000-03-13 15:55:09 +000011012 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011013}
11014
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015/* Concat to string or Unicode object giving a new Unicode object. */
11016
Alexander Belopolsky40018472011-02-26 01:02:56 +000011017PyObject *
11018PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011021 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011022 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
11024 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
11032 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011033 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011037 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 }
11041
Victor Stinner488fa492011-12-12 00:01:39 +010011042 u_len = PyUnicode_GET_LENGTH(u);
11043 v_len = PyUnicode_GET_LENGTH(v);
11044 if (u_len > PY_SSIZE_T_MAX - v_len) {
11045 PyErr_SetString(PyExc_OverflowError,
11046 "strings are too large to concat");
11047 goto onError;
11048 }
11049 new_len = u_len + v_len;
11050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011052 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020011053 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011056 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011059 copy_characters(w, 0, u, 0, u_len);
11060 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061 Py_DECREF(u);
11062 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011063 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067 Py_XDECREF(u);
11068 Py_XDECREF(v);
11069 return NULL;
11070}
11071
Walter Dörwald1ab83302007-05-18 17:15:44 +000011072void
Victor Stinner23e56682011-10-03 03:54:37 +020011073PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011074{
Victor Stinner23e56682011-10-03 03:54:37 +020011075 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011076 Py_UCS4 maxchar, maxchar2;
11077 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011078
11079 if (p_left == NULL) {
11080 if (!PyErr_Occurred())
11081 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011082 return;
11083 }
Victor Stinner23e56682011-10-03 03:54:37 +020011084 left = *p_left;
11085 if (right == NULL || !PyUnicode_Check(left)) {
11086 if (!PyErr_Occurred())
11087 PyErr_BadInternalCall();
11088 goto error;
11089 }
11090
Benjamin Petersonbac79492012-01-14 13:34:47 -050011091 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011092 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011093 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011094 goto error;
11095
Victor Stinner488fa492011-12-12 00:01:39 +010011096 /* Shortcuts */
11097 if (left == unicode_empty) {
11098 Py_DECREF(left);
11099 Py_INCREF(right);
11100 *p_left = right;
11101 return;
11102 }
11103 if (right == unicode_empty)
11104 return;
11105
11106 left_len = PyUnicode_GET_LENGTH(left);
11107 right_len = PyUnicode_GET_LENGTH(right);
11108 if (left_len > PY_SSIZE_T_MAX - right_len) {
11109 PyErr_SetString(PyExc_OverflowError,
11110 "strings are too large to concat");
11111 goto error;
11112 }
11113 new_len = left_len + right_len;
11114
11115 if (unicode_modifiable(left)
11116 && PyUnicode_CheckExact(right)
11117 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011118 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11119 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011120 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011121 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011122 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11123 {
11124 /* append inplace */
11125 if (unicode_resize(p_left, new_len) != 0) {
11126 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11127 * deallocated so it cannot be put back into
11128 * 'variable'. The MemoryError is raised when there
11129 * is no value in 'variable', which might (very
11130 * remotely) be a cause of incompatibilities.
11131 */
11132 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011133 }
Victor Stinner488fa492011-12-12 00:01:39 +010011134 /* copy 'right' into the newly allocated area of 'left' */
11135 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011136 }
Victor Stinner488fa492011-12-12 00:01:39 +010011137 else {
11138 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11139 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020011140 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011141
Victor Stinner488fa492011-12-12 00:01:39 +010011142 /* Concat the two Unicode strings */
11143 res = PyUnicode_New(new_len, maxchar);
11144 if (res == NULL)
11145 goto error;
11146 copy_characters(res, 0, left, 0, left_len);
11147 copy_characters(res, left_len, right, 0, right_len);
11148 Py_DECREF(left);
11149 *p_left = res;
11150 }
11151 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011152 return;
11153
11154error:
Victor Stinner488fa492011-12-12 00:01:39 +010011155 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011156}
11157
11158void
11159PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011161 PyUnicode_Append(pleft, right);
11162 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011163}
11164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011165PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011169string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011173unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011175 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011176 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011177 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 int kind1, kind2, kind;
11180 void *buf1, *buf2;
11181 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Jesus Ceaac451502011-04-20 17:09:23 +020011183 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11184 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 kind1 = PyUnicode_KIND(self);
11188 kind2 = PyUnicode_KIND(substring);
11189 kind = kind1 > kind2 ? kind1 : kind2;
11190 buf1 = PyUnicode_DATA(self);
11191 buf2 = PyUnicode_DATA(substring);
11192 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011193 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (!buf1) {
11195 Py_DECREF(substring);
11196 return NULL;
11197 }
11198 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011199 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (!buf2) {
11201 Py_DECREF(substring);
11202 if (kind1 != kind) PyMem_Free(buf1);
11203 return NULL;
11204 }
11205 len1 = PyUnicode_GET_LENGTH(self);
11206 len2 = PyUnicode_GET_LENGTH(substring);
11207
11208 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011209 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 case PyUnicode_1BYTE_KIND:
11211 iresult = ucs1lib_count(
11212 ((Py_UCS1*)buf1) + start, end - start,
11213 buf2, len2, PY_SSIZE_T_MAX
11214 );
11215 break;
11216 case PyUnicode_2BYTE_KIND:
11217 iresult = ucs2lib_count(
11218 ((Py_UCS2*)buf1) + start, end - start,
11219 buf2, len2, PY_SSIZE_T_MAX
11220 );
11221 break;
11222 case PyUnicode_4BYTE_KIND:
11223 iresult = ucs4lib_count(
11224 ((Py_UCS4*)buf1) + start, end - start,
11225 buf2, len2, PY_SSIZE_T_MAX
11226 );
11227 break;
11228 default:
11229 assert(0); iresult = 0;
11230 }
11231
11232 result = PyLong_FromSsize_t(iresult);
11233
11234 if (kind1 != kind)
11235 PyMem_Free(buf1);
11236 if (kind2 != kind)
11237 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
11239 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011240
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 return result;
11242}
11243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011245 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011247Encode S using the codec registered for encoding. Default encoding\n\
11248is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011249handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011250a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11251'xmlcharrefreplace' as well as any other name registered with\n\
11252codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253
11254static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011255unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011257 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 char *encoding = NULL;
11259 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011260
Benjamin Peterson308d6372009-09-18 21:42:35 +000011261 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11262 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011264 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011265}
11266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011267PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269\n\
11270Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011274unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011276 Py_ssize_t i, j, line_pos, src_len, incr;
11277 Py_UCS4 ch;
11278 PyObject *u;
11279 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011281 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011282 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
11284 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Antoine Pitrou22425222011-10-04 19:10:51 +020011287 if (PyUnicode_READY(self) == -1)
11288 return NULL;
11289
Thomas Wouters7e474022000-07-16 12:04:32 +000011290 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 src_len = PyUnicode_GET_LENGTH(self);
11292 i = j = line_pos = 0;
11293 kind = PyUnicode_KIND(self);
11294 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011295 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011296 for (; i < src_len; i++) {
11297 ch = PyUnicode_READ(kind, src_data, i);
11298 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011299 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 goto overflow;
11304 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011306 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 goto overflow;
11311 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011313 if (ch == '\n' || ch == '\r')
11314 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011316 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011317 if (!found)
11318 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011319
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011321 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322 if (!u)
11323 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011324 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
Antoine Pitroue71d5742011-10-04 15:55:09 +020011326 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
Antoine Pitroue71d5742011-10-04 15:55:09 +020011328 for (; i < src_len; i++) {
11329 ch = PyUnicode_READ(kind, src_data, i);
11330 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011332 incr = tabsize - (line_pos % tabsize);
11333 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011334 FILL(kind, dest_data, ' ', j, incr);
11335 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011337 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011339 line_pos++;
11340 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011341 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011342 if (ch == '\n' || ch == '\r')
11343 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011345 }
11346 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011347 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011348
Antoine Pitroue71d5742011-10-04 15:55:09 +020011349 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011350 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352}
11353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011354PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356\n\
11357Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011358such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359arguments start and end are interpreted as in slice notation.\n\
11360\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011361Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
11363static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011366 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011367 Py_ssize_t start;
11368 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011369 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
Jesus Ceaac451502011-04-20 17:09:23 +020011371 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11372 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 if (PyUnicode_READY(self) == -1)
11376 return NULL;
11377 if (PyUnicode_READY(substring) == -1)
11378 return NULL;
11379
Victor Stinner7931d9a2011-11-04 00:22:48 +010011380 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381
11382 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (result == -2)
11385 return NULL;
11386
Christian Heimes217cfd12007-12-02 14:31:20 +000011387 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388}
11389
11390static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011391unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011393 void *data;
11394 enum PyUnicode_Kind kind;
11395 Py_UCS4 ch;
11396 PyObject *res;
11397
11398 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11399 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011401 }
11402 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11403 PyErr_SetString(PyExc_IndexError, "string index out of range");
11404 return NULL;
11405 }
11406 kind = PyUnicode_KIND(self);
11407 data = PyUnicode_DATA(self);
11408 ch = PyUnicode_READ(kind, data, index);
11409 if (ch < 256)
11410 return get_latin1_char(ch);
11411
11412 res = PyUnicode_New(1, ch);
11413 if (res == NULL)
11414 return NULL;
11415 kind = PyUnicode_KIND(res);
11416 data = PyUnicode_DATA(res);
11417 PyUnicode_WRITE(kind, data, 0, ch);
11418 assert(_PyUnicode_CheckConsistency(res, 1));
11419 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420}
11421
Guido van Rossumc2504932007-09-18 19:42:40 +000011422/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011423 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011424static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011425unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426{
Guido van Rossumc2504932007-09-18 19:42:40 +000011427 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011428 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011429
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011430#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011431 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011432#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (_PyUnicode_HASH(self) != -1)
11434 return _PyUnicode_HASH(self);
11435 if (PyUnicode_READY(self) == -1)
11436 return -1;
11437 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011438 /*
11439 We make the hash of the empty string be 0, rather than using
11440 (prefix ^ suffix), since this slightly obfuscates the hash secret
11441 */
11442 if (len == 0) {
11443 _PyUnicode_HASH(self) = 0;
11444 return 0;
11445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446
11447 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011448#define HASH(P) \
11449 x ^= (Py_uhash_t) *P << 7; \
11450 while (--len >= 0) \
11451 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452
Georg Brandl2fb477c2012-02-21 00:33:36 +010011453 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 switch (PyUnicode_KIND(self)) {
11455 case PyUnicode_1BYTE_KIND: {
11456 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11457 HASH(c);
11458 break;
11459 }
11460 case PyUnicode_2BYTE_KIND: {
11461 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11462 HASH(s);
11463 break;
11464 }
11465 default: {
11466 Py_UCS4 *l;
11467 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11468 "Impossible switch case in unicode_hash");
11469 l = PyUnicode_4BYTE_DATA(self);
11470 HASH(l);
11471 break;
11472 }
11473 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011474 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11475 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476
Guido van Rossumc2504932007-09-18 19:42:40 +000011477 if (x == -1)
11478 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011480 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011487Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
11489static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011492 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011493 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011494 Py_ssize_t start;
11495 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Jesus Ceaac451502011-04-20 17:09:23 +020011497 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11498 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 if (PyUnicode_READY(self) == -1)
11502 return NULL;
11503 if (PyUnicode_READY(substring) == -1)
11504 return NULL;
11505
Victor Stinner7931d9a2011-11-04 00:22:48 +010011506 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
11508 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (result == -2)
11511 return NULL;
11512
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 if (result < 0) {
11514 PyErr_SetString(PyExc_ValueError, "substring not found");
11515 return NULL;
11516 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011517
Christian Heimes217cfd12007-12-02 14:31:20 +000011518 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519}
11520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011521PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011524Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
11527static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011528unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 Py_ssize_t i, length;
11531 int kind;
11532 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 int cased;
11534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (PyUnicode_READY(self) == -1)
11536 return NULL;
11537 length = PyUnicode_GET_LENGTH(self);
11538 kind = PyUnicode_KIND(self);
11539 data = PyUnicode_DATA(self);
11540
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (length == 1)
11543 return PyBool_FromLong(
11544 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011546 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011549
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 for (i = 0; i < length; i++) {
11552 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011553
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11555 return PyBool_FromLong(0);
11556 else if (!cased && Py_UNICODE_ISLOWER(ch))
11557 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011559 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560}
11561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011562PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011565Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011566at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
11568static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011569unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 Py_ssize_t i, length;
11572 int kind;
11573 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 int cased;
11575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 if (PyUnicode_READY(self) == -1)
11577 return NULL;
11578 length = PyUnicode_GET_LENGTH(self);
11579 kind = PyUnicode_KIND(self);
11580 data = PyUnicode_DATA(self);
11581
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 if (length == 1)
11584 return PyBool_FromLong(
11585 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011587 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011590
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 for (i = 0; i < length; i++) {
11593 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011594
Benjamin Peterson29060642009-01-31 22:14:21 +000011595 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11596 return PyBool_FromLong(0);
11597 else if (!cased && Py_UNICODE_ISUPPER(ch))
11598 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011600 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601}
11602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011606Return True if S is a titlecased string and there is at least one\n\
11607character in S, i.e. upper- and titlecase characters may only\n\
11608follow uncased characters and lowercase characters only cased ones.\n\
11609Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011612unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 Py_ssize_t i, length;
11615 int kind;
11616 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 int cased, previous_is_cased;
11618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (PyUnicode_READY(self) == -1)
11620 return NULL;
11621 length = PyUnicode_GET_LENGTH(self);
11622 kind = PyUnicode_KIND(self);
11623 data = PyUnicode_DATA(self);
11624
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 if (length == 1) {
11627 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11628 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11629 (Py_UNICODE_ISUPPER(ch) != 0));
11630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011632 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011635
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636 cased = 0;
11637 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 for (i = 0; i < length; i++) {
11639 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011640
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11642 if (previous_is_cased)
11643 return PyBool_FromLong(0);
11644 previous_is_cased = 1;
11645 cased = 1;
11646 }
11647 else if (Py_UNICODE_ISLOWER(ch)) {
11648 if (!previous_is_cased)
11649 return PyBool_FromLong(0);
11650 previous_is_cased = 1;
11651 cased = 1;
11652 }
11653 else
11654 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011656 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657}
11658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011659PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011662Return True if all characters in S are whitespace\n\
11663and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664
11665static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 Py_ssize_t i, length;
11669 int kind;
11670 void *data;
11671
11672 if (PyUnicode_READY(self) == -1)
11673 return NULL;
11674 length = PyUnicode_GET_LENGTH(self);
11675 kind = PyUnicode_KIND(self);
11676 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (length == 1)
11680 return PyBool_FromLong(
11681 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011683 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 for (i = 0; i < length; i++) {
11688 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011689 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011692 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693}
11694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011697\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011698Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011699and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011700
11701static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011702unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 Py_ssize_t i, length;
11705 int kind;
11706 void *data;
11707
11708 if (PyUnicode_READY(self) == -1)
11709 return NULL;
11710 length = PyUnicode_GET_LENGTH(self);
11711 kind = PyUnicode_KIND(self);
11712 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011713
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011714 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 if (length == 1)
11716 return PyBool_FromLong(
11717 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011718
11719 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 for (i = 0; i < length; i++) {
11724 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011726 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011727 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011728}
11729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011730PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011731 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011732\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011733Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011734and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011735
11736static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011737unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 int kind;
11740 void *data;
11741 Py_ssize_t len, i;
11742
11743 if (PyUnicode_READY(self) == -1)
11744 return NULL;
11745
11746 kind = PyUnicode_KIND(self);
11747 data = PyUnicode_DATA(self);
11748 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011749
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011750 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (len == 1) {
11752 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11753 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11754 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011755
11756 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 for (i = 0; i < len; i++) {
11761 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011762 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011764 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011765 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011766}
11767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011768PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011771Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
11774static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011775unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 Py_ssize_t i, length;
11778 int kind;
11779 void *data;
11780
11781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783 length = PyUnicode_GET_LENGTH(self);
11784 kind = PyUnicode_KIND(self);
11785 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (length == 1)
11789 return PyBool_FromLong(
11790 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011792 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 for (i = 0; i < length; i++) {
11797 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011800 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801}
11802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011803PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011806Return True if all characters in S are digits\n\
11807and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
11809static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011810unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 Py_ssize_t i, length;
11813 int kind;
11814 void *data;
11815
11816 if (PyUnicode_READY(self) == -1)
11817 return NULL;
11818 length = PyUnicode_GET_LENGTH(self);
11819 kind = PyUnicode_KIND(self);
11820 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (length == 1) {
11824 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11825 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011828 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 for (i = 0; i < length; i++) {
11833 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011836 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837}
11838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011839PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011842Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011843False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
11845static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 Py_ssize_t i, length;
11849 int kind;
11850 void *data;
11851
11852 if (PyUnicode_READY(self) == -1)
11853 return NULL;
11854 length = PyUnicode_GET_LENGTH(self);
11855 kind = PyUnicode_KIND(self);
11856 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 1)
11860 return PyBool_FromLong(
11861 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011863 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 for (i = 0; i < length; i++) {
11868 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872}
11873
Martin v. Löwis47383402007-08-15 07:32:56 +000011874int
11875PyUnicode_IsIdentifier(PyObject *self)
11876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 int kind;
11878 void *data;
11879 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011880 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (PyUnicode_READY(self) == -1) {
11883 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 }
11886
11887 /* Special case for empty strings */
11888 if (PyUnicode_GET_LENGTH(self) == 0)
11889 return 0;
11890 kind = PyUnicode_KIND(self);
11891 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011892
11893 /* PEP 3131 says that the first character must be in
11894 XID_Start and subsequent characters in XID_Continue,
11895 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011896 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011897 letters, digits, underscore). However, given the current
11898 definition of XID_Start and XID_Continue, it is sufficient
11899 to check just for these, except that _ must be allowed
11900 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011902 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011903 return 0;
11904
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011905 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011908 return 1;
11909}
11910
11911PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011913\n\
11914Return True if S is a valid identifier according\n\
11915to the language definition.");
11916
11917static PyObject*
11918unicode_isidentifier(PyObject *self)
11919{
11920 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11921}
11922
Georg Brandl559e5d72008-06-11 18:37:52 +000011923PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011925\n\
11926Return True if all characters in S are considered\n\
11927printable in repr() or S is empty, False otherwise.");
11928
11929static PyObject*
11930unicode_isprintable(PyObject *self)
11931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 Py_ssize_t i, length;
11933 int kind;
11934 void *data;
11935
11936 if (PyUnicode_READY(self) == -1)
11937 return NULL;
11938 length = PyUnicode_GET_LENGTH(self);
11939 kind = PyUnicode_KIND(self);
11940 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011941
11942 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (length == 1)
11944 return PyBool_FromLong(
11945 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 for (i = 0; i < length; i++) {
11948 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011949 Py_RETURN_FALSE;
11950 }
11951 }
11952 Py_RETURN_TRUE;
11953}
11954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011955PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011956 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957\n\
11958Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011959iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960
11961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011962unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011964 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965}
11966
Martin v. Löwis18e16552006-02-15 17:27:45 +000011967static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011968unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(self) == -1)
11971 return -1;
11972 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973}
11974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011975PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011978Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011979done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
11981static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011982unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011984 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 Py_UCS4 fillchar = ' ';
11986
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011987 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 return NULL;
11989
Benjamin Petersonbac79492012-01-14 13:34:47 -050011990 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
Victor Stinnerc4b49542011-12-11 22:44:26 +010011993 if (PyUnicode_GET_LENGTH(self) >= width)
11994 return unicode_result_unchanged(self);
11995
11996 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997}
11998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011999PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012002Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012005unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012007 if (PyUnicode_READY(self) == -1)
12008 return NULL;
12009 if (PyUnicode_IS_ASCII(self))
12010 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012011 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012}
12013
/* Strip modes; the values double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: skip the 3-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
12022
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012023/* externally visible for str.strip(unicode) */
12024PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012025_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 void *data;
12028 int kind;
12029 Py_ssize_t i, j, len;
12030 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12033 return NULL;
12034
12035 kind = PyUnicode_KIND(self);
12036 data = PyUnicode_DATA(self);
12037 len = PyUnicode_GET_LENGTH(self);
12038 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12039 PyUnicode_DATA(sepobj),
12040 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000012041
Benjamin Peterson14339b62009-01-31 16:36:08 +000012042 i = 0;
12043 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 while (i < len &&
12045 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 i++;
12047 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012048 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012049
Benjamin Peterson14339b62009-01-31 16:36:08 +000012050 j = len;
12051 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 do {
12053 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 } while (j >= i &&
12055 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012057 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012058
Victor Stinner7931d9a2011-11-04 00:22:48 +010012059 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060}
12061
12062PyObject*
12063PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12064{
12065 unsigned char *data;
12066 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012067 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068
Victor Stinnerde636f32011-10-01 03:55:54 +020012069 if (PyUnicode_READY(self) == -1)
12070 return NULL;
12071
Victor Stinner684d5fd2012-05-03 02:32:34 +020012072 length = PyUnicode_GET_LENGTH(self);
12073 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012074
Victor Stinner684d5fd2012-05-03 02:32:34 +020012075 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012076 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077
Victor Stinnerde636f32011-10-01 03:55:54 +020012078 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012079 PyErr_SetString(PyExc_IndexError, "string index out of range");
12080 return NULL;
12081 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020012082 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020012083 Py_INCREF(unicode_empty);
12084 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020012085 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012086
Victor Stinner684d5fd2012-05-03 02:32:34 +020012087 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012088 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012089 data = PyUnicode_1BYTE_DATA(self);
12090 return unicode_fromascii(data + start, length);
12091 }
12092 else {
12093 kind = PyUnicode_KIND(self);
12094 data = PyUnicode_1BYTE_DATA(self);
12095 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012096 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012097 length);
12098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
12101static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012102do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 int kind;
12105 void *data;
12106 Py_ssize_t len, i, j;
12107
12108 if (PyUnicode_READY(self) == -1)
12109 return NULL;
12110
12111 kind = PyUnicode_KIND(self);
12112 data = PyUnicode_DATA(self);
12113 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114
Benjamin Peterson14339b62009-01-31 16:36:08 +000012115 i = 0;
12116 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012118 i++;
12119 }
12120 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 j = len;
12123 if (striptype != LEFTSTRIP) {
12124 do {
12125 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127 j++;
12128 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012129
Victor Stinner7931d9a2011-11-04 00:22:48 +010012130 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131}
12132
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133
12134static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012135do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12140 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012141
Benjamin Peterson14339b62009-01-31 16:36:08 +000012142 if (sep != NULL && sep != Py_None) {
12143 if (PyUnicode_Check(sep))
12144 return _PyUnicode_XStrip(self, striptype, sep);
12145 else {
12146 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 "%s arg must be None or str",
12148 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 return NULL;
12150 }
12151 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012152
Benjamin Peterson14339b62009-01-31 16:36:08 +000012153 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154}
12155
12156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012157PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159\n\
12160Return a copy of the string S with leading and trailing\n\
12161whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012162If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163
12164static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012165unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012167 if (PyTuple_GET_SIZE(args) == 0)
12168 return do_strip(self, BOTHSTRIP); /* Common case */
12169 else
12170 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171}
12172
12173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012174PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012176\n\
12177Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012178If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012179
12180static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012181unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012182{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012183 if (PyTuple_GET_SIZE(args) == 0)
12184 return do_strip(self, LEFTSTRIP); /* Common case */
12185 else
12186 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012187}
12188
12189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012190PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012192\n\
12193Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012194If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012195
12196static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012197unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012198{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012199 if (PyTuple_GET_SIZE(args) == 0)
12200 return do_strip(self, RIGHTSTRIP); /* Common case */
12201 else
12202 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012203}
12204
12205
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012207unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012209 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
Georg Brandl222de0f2009-04-12 12:01:50 +000012212 if (len < 1) {
12213 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012214 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
Victor Stinnerc4b49542011-12-11 22:44:26 +010012217 /* no repeat, return original string */
12218 if (len == 1)
12219 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012220
Benjamin Petersonbac79492012-01-14 13:34:47 -050012221 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 return NULL;
12223
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012224 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012225 PyErr_SetString(PyExc_OverflowError,
12226 "repeated string is too long");
12227 return NULL;
12228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012230
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012231 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 if (!u)
12233 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012234 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (PyUnicode_GET_LENGTH(str) == 1) {
12237 const int kind = PyUnicode_KIND(str);
12238 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012239 if (kind == PyUnicode_1BYTE_KIND) {
12240 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012241 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012242 }
12243 else if (kind == PyUnicode_2BYTE_KIND) {
12244 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012245 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012246 ucs2[n] = fill_char;
12247 } else {
12248 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12249 assert(kind == PyUnicode_4BYTE_KIND);
12250 for (n = 0; n < len; ++n)
12251 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 }
12254 else {
12255 /* number of characters copied this far */
12256 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012257 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 char *to = (char *) PyUnicode_DATA(u);
12259 Py_MEMCPY(to, PyUnicode_DATA(str),
12260 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 n = (done <= nchars-done) ? done : nchars-done;
12263 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 }
12267
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012268 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012269 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270}
12271
Alexander Belopolsky40018472011-02-26 01:02:56 +000012272PyObject *
12273PyUnicode_Replace(PyObject *obj,
12274 PyObject *subobj,
12275 PyObject *replobj,
12276 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277{
12278 PyObject *self;
12279 PyObject *str1;
12280 PyObject *str2;
12281 PyObject *result;
12282
12283 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012284 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012287 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 Py_DECREF(self);
12289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 }
12291 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012292 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 Py_DECREF(self);
12294 Py_DECREF(str1);
12295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012297 if (PyUnicode_READY(self) == -1 ||
12298 PyUnicode_READY(str1) == -1 ||
12299 PyUnicode_READY(str2) == -1)
12300 result = NULL;
12301 else
12302 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 Py_DECREF(self);
12304 Py_DECREF(str1);
12305 Py_DECREF(str2);
12306 return result;
12307}
12308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012309PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012310 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311\n\
12312Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012313old replaced by new. If the optional argument count is\n\
12314given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315
12316static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 PyObject *str1;
12320 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012321 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322 PyObject *result;
12323
Martin v. Löwis18e16552006-02-15 17:27:45 +000012324 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012326 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012329 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 return NULL;
12331 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012332 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 Py_DECREF(str1);
12334 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012335 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012336 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12337 result = NULL;
12338 else
12339 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340
12341 Py_DECREF(str1);
12342 Py_DECREF(str2);
12343 return result;
12344}
12345
Alexander Belopolsky40018472011-02-26 01:02:56 +000012346static PyObject *
12347unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012349 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 Py_ssize_t isize;
12351 Py_ssize_t osize, squote, dquote, i, o;
12352 Py_UCS4 max, quote;
12353 int ikind, okind;
12354 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012357 return NULL;
12358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 isize = PyUnicode_GET_LENGTH(unicode);
12360 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 /* Compute length of output, quote characters, and
12363 maximum character */
12364 osize = 2; /* quotes */
12365 max = 127;
12366 squote = dquote = 0;
12367 ikind = PyUnicode_KIND(unicode);
12368 for (i = 0; i < isize; i++) {
12369 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12370 switch (ch) {
12371 case '\'': squote++; osize++; break;
12372 case '"': dquote++; osize++; break;
12373 case '\\': case '\t': case '\r': case '\n':
12374 osize += 2; break;
12375 default:
12376 /* Fast-path ASCII */
12377 if (ch < ' ' || ch == 0x7f)
12378 osize += 4; /* \xHH */
12379 else if (ch < 0x7f)
12380 osize++;
12381 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12382 osize++;
12383 max = ch > max ? ch : max;
12384 }
12385 else if (ch < 0x100)
12386 osize += 4; /* \xHH */
12387 else if (ch < 0x10000)
12388 osize += 6; /* \uHHHH */
12389 else
12390 osize += 10; /* \uHHHHHHHH */
12391 }
12392 }
12393
12394 quote = '\'';
12395 if (squote) {
12396 if (dquote)
12397 /* Both squote and dquote present. Use squote,
12398 and escape them */
12399 osize += squote;
12400 else
12401 quote = '"';
12402 }
12403
12404 repr = PyUnicode_New(osize, max);
12405 if (repr == NULL)
12406 return NULL;
12407 okind = PyUnicode_KIND(repr);
12408 odata = PyUnicode_DATA(repr);
12409
12410 PyUnicode_WRITE(okind, odata, 0, quote);
12411 PyUnicode_WRITE(okind, odata, osize-1, quote);
12412
12413 for (i = 0, o = 1; i < isize; i++) {
12414 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012415
12416 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 if ((ch == quote) || (ch == '\\')) {
12418 PyUnicode_WRITE(okind, odata, o++, '\\');
12419 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012420 continue;
12421 }
12422
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012424 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 PyUnicode_WRITE(okind, odata, o++, '\\');
12426 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012427 }
12428 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 PyUnicode_WRITE(okind, odata, o++, '\\');
12430 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012431 }
12432 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 PyUnicode_WRITE(okind, odata, o++, '\\');
12434 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012435 }
12436
12437 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012438 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 PyUnicode_WRITE(okind, odata, o++, '\\');
12440 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012443 }
12444
Georg Brandl559e5d72008-06-11 18:37:52 +000012445 /* Copy ASCII characters as-is */
12446 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012448 }
12449
Benjamin Peterson29060642009-01-31 22:14:21 +000012450 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012451 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012452 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012453 (categories Z* and C* except ASCII space)
12454 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012456 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (ch <= 0xff) {
12458 PyUnicode_WRITE(okind, odata, o++, '\\');
12459 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012462 }
12463 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 else if (ch >= 0x10000) {
12465 PyUnicode_WRITE(okind, odata, o++, '\\');
12466 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012475 }
12476 /* Map 16-bit characters to '\uxxxx' */
12477 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 PyUnicode_WRITE(okind, odata, o++, '\\');
12479 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012480 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12481 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12482 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12483 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012484 }
12485 }
12486 /* Copy characters as-is */
12487 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012489 }
12490 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012493 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012494 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495}
12496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499\n\
12500Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012501such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502arguments start and end are interpreted as in slice notation.\n\
12503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012504Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505
12506static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012509 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012510 Py_ssize_t start;
12511 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012512 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
Jesus Ceaac451502011-04-20 17:09:23 +020012514 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12515 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 if (PyUnicode_READY(self) == -1)
12519 return NULL;
12520 if (PyUnicode_READY(substring) == -1)
12521 return NULL;
12522
Victor Stinner7931d9a2011-11-04 00:22:48 +010012523 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
12525 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 if (result == -2)
12528 return NULL;
12529
Christian Heimes217cfd12007-12-02 14:31:20 +000012530 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012533PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012536Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
12538static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012541 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012542 Py_ssize_t start;
12543 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012544 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
Jesus Ceaac451502011-04-20 17:09:23 +020012546 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12547 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (PyUnicode_READY(self) == -1)
12551 return NULL;
12552 if (PyUnicode_READY(substring) == -1)
12553 return NULL;
12554
Victor Stinner7931d9a2011-11-04 00:22:48 +010012555 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556
12557 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 if (result == -2)
12560 return NULL;
12561
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562 if (result < 0) {
12563 PyErr_SetString(PyExc_ValueError, "substring not found");
12564 return NULL;
12565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566
Christian Heimes217cfd12007-12-02 14:31:20 +000012567 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568}
12569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012570PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012573Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012574done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575
12576static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012577unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012579 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 Py_UCS4 fillchar = ' ';
12581
Victor Stinnere9a29352011-10-01 02:14:59 +020012582 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012584
Benjamin Petersonbac79492012-01-14 13:34:47 -050012585 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586 return NULL;
12587
Victor Stinnerc4b49542011-12-11 22:44:26 +010012588 if (PyUnicode_GET_LENGTH(self) >= width)
12589 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590
Victor Stinnerc4b49542011-12-11 22:44:26 +010012591 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592}
12593
Alexander Belopolsky40018472011-02-26 01:02:56 +000012594PyObject *
12595PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596{
12597 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012598
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599 s = PyUnicode_FromObject(s);
12600 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 if (sep != NULL) {
12603 sep = PyUnicode_FromObject(sep);
12604 if (sep == NULL) {
12605 Py_DECREF(s);
12606 return NULL;
12607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608 }
12609
Victor Stinner9310abb2011-10-05 00:59:23 +020012610 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611
12612 Py_DECREF(s);
12613 Py_XDECREF(sep);
12614 return result;
12615}
12616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012617PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012618 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619\n\
12620Return a list of the words in S, using sep as the\n\
12621delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012622splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012623whitespace string is a separator and empty strings are\n\
12624removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625
12626static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012627unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012629 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012631 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012633 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12634 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635 return NULL;
12636
12637 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012640 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012642 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
Thomas Wouters477c8d52006-05-27 19:21:47 +000012645PyObject *
12646PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12647{
12648 PyObject* str_obj;
12649 PyObject* sep_obj;
12650 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 int kind1, kind2, kind;
12652 void *buf1 = NULL, *buf2 = NULL;
12653 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012654
12655 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012656 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012658 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012659 if (!sep_obj) {
12660 Py_DECREF(str_obj);
12661 return NULL;
12662 }
12663 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12664 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012665 Py_DECREF(str_obj);
12666 return NULL;
12667 }
12668
Victor Stinner14f8f022011-10-05 20:58:25 +020012669 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012671 kind = Py_MAX(kind1, kind2);
12672 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012674 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 if (!buf1)
12676 goto onError;
12677 buf2 = PyUnicode_DATA(sep_obj);
12678 if (kind2 != kind)
12679 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12680 if (!buf2)
12681 goto onError;
12682 len1 = PyUnicode_GET_LENGTH(str_obj);
12683 len2 = PyUnicode_GET_LENGTH(sep_obj);
12684
Benjamin Petersonead6b532011-12-20 17:23:42 -060012685 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012687 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12688 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12689 else
12690 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 break;
12692 case PyUnicode_2BYTE_KIND:
12693 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12694 break;
12695 case PyUnicode_4BYTE_KIND:
12696 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12697 break;
12698 default:
12699 assert(0);
12700 out = 0;
12701 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012702
12703 Py_DECREF(sep_obj);
12704 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 if (kind1 != kind)
12706 PyMem_Free(buf1);
12707 if (kind2 != kind)
12708 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012709
12710 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 onError:
12712 Py_DECREF(sep_obj);
12713 Py_DECREF(str_obj);
12714 if (kind1 != kind && buf1)
12715 PyMem_Free(buf1);
12716 if (kind2 != kind && buf2)
12717 PyMem_Free(buf2);
12718 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012719}
12720
12721
12722PyObject *
12723PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12724{
12725 PyObject* str_obj;
12726 PyObject* sep_obj;
12727 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 int kind1, kind2, kind;
12729 void *buf1 = NULL, *buf2 = NULL;
12730 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012731
12732 str_obj = PyUnicode_FromObject(str_in);
12733 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735 sep_obj = PyUnicode_FromObject(sep_in);
12736 if (!sep_obj) {
12737 Py_DECREF(str_obj);
12738 return NULL;
12739 }
12740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 kind1 = PyUnicode_KIND(str_in);
12742 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012743 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 buf1 = PyUnicode_DATA(str_in);
12745 if (kind1 != kind)
12746 buf1 = _PyUnicode_AsKind(str_in, kind);
12747 if (!buf1)
12748 goto onError;
12749 buf2 = PyUnicode_DATA(sep_obj);
12750 if (kind2 != kind)
12751 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12752 if (!buf2)
12753 goto onError;
12754 len1 = PyUnicode_GET_LENGTH(str_obj);
12755 len2 = PyUnicode_GET_LENGTH(sep_obj);
12756
Benjamin Petersonead6b532011-12-20 17:23:42 -060012757 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012759 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12760 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12761 else
12762 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 break;
12764 case PyUnicode_2BYTE_KIND:
12765 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12766 break;
12767 case PyUnicode_4BYTE_KIND:
12768 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12769 break;
12770 default:
12771 assert(0);
12772 out = 0;
12773 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012774
12775 Py_DECREF(sep_obj);
12776 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 if (kind1 != kind)
12778 PyMem_Free(buf1);
12779 if (kind2 != kind)
12780 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012781
12782 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 onError:
12784 Py_DECREF(sep_obj);
12785 Py_DECREF(str_obj);
12786 if (kind1 != kind && buf1)
12787 PyMem_Free(buf1);
12788 if (kind2 != kind && buf2)
12789 PyMem_Free(buf2);
12790 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791}
12792
12793PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012796Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012798found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799
12800static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012801unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802{
Victor Stinner9310abb2011-10-05 00:59:23 +020012803 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804}
12805
12806PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012807 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012809Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012811separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812
12813static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012814unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815{
Victor Stinner9310abb2011-10-05 00:59:23 +020012816 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817}
12818
Alexander Belopolsky40018472011-02-26 01:02:56 +000012819PyObject *
12820PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821{
12822 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012823
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012824 s = PyUnicode_FromObject(s);
12825 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 if (sep != NULL) {
12828 sep = PyUnicode_FromObject(sep);
12829 if (sep == NULL) {
12830 Py_DECREF(s);
12831 return NULL;
12832 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012833 }
12834
Victor Stinner9310abb2011-10-05 00:59:23 +020012835 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012836
12837 Py_DECREF(s);
12838 Py_XDECREF(sep);
12839 return result;
12840}
12841
12842PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012843 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844\n\
12845Return a list of the words in S, using sep as the\n\
12846delimiter string, starting at the end of the string and\n\
12847working to the front. If maxsplit is given, at most maxsplit\n\
12848splits are done. If sep is not specified, any whitespace string\n\
12849is a separator.");
12850
12851static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012852unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012853{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012854 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012856 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012858 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12859 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012860 return NULL;
12861
12862 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012864 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012865 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012867 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012868}
12869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012870PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872\n\
12873Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012874Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012875is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
12877static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012878unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012880 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012881 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012883 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12884 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885 return NULL;
12886
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012887 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888}
12889
12890static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012891PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012893 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012896PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898\n\
12899Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012900and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901
12902static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012903unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012905 if (PyUnicode_READY(self) == -1)
12906 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012907 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908}
12909
Georg Brandlceee0772007-11-27 23:48:05 +000012910PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012912\n\
12913Return a translation table usable for str.translate().\n\
12914If there is only one argument, it must be a dictionary mapping Unicode\n\
12915ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012916Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012917If there are two arguments, they must be strings of equal length, and\n\
12918in the resulting dictionary, each character in x will be mapped to the\n\
12919character at the same position in y. If there is a third argument, it\n\
12920must be a string, whose characters will be mapped to None in the result.");
12921
12922static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012923unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012924{
12925 PyObject *x, *y = NULL, *z = NULL;
12926 PyObject *new = NULL, *key, *value;
12927 Py_ssize_t i = 0;
12928 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012929
Georg Brandlceee0772007-11-27 23:48:05 +000012930 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12931 return NULL;
12932 new = PyDict_New();
12933 if (!new)
12934 return NULL;
12935 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 int x_kind, y_kind, z_kind;
12937 void *x_data, *y_data, *z_data;
12938
Georg Brandlceee0772007-11-27 23:48:05 +000012939 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012940 if (!PyUnicode_Check(x)) {
12941 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12942 "be a string if there is a second argument");
12943 goto err;
12944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012946 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12947 "arguments must have equal length");
12948 goto err;
12949 }
12950 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 x_kind = PyUnicode_KIND(x);
12952 y_kind = PyUnicode_KIND(y);
12953 x_data = PyUnicode_DATA(x);
12954 y_data = PyUnicode_DATA(y);
12955 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12956 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012957 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012958 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012959 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012960 if (!value) {
12961 Py_DECREF(key);
12962 goto err;
12963 }
Georg Brandlceee0772007-11-27 23:48:05 +000012964 res = PyDict_SetItem(new, key, value);
12965 Py_DECREF(key);
12966 Py_DECREF(value);
12967 if (res < 0)
12968 goto err;
12969 }
12970 /* create entries for deleting chars in z */
12971 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 z_kind = PyUnicode_KIND(z);
12973 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012974 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012976 if (!key)
12977 goto err;
12978 res = PyDict_SetItem(new, key, Py_None);
12979 Py_DECREF(key);
12980 if (res < 0)
12981 goto err;
12982 }
12983 }
12984 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 int kind;
12986 void *data;
12987
Georg Brandlceee0772007-11-27 23:48:05 +000012988 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012989 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012990 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12991 "to maketrans it must be a dict");
12992 goto err;
12993 }
12994 /* copy entries into the new dict, converting string keys to int keys */
12995 while (PyDict_Next(x, &i, &key, &value)) {
12996 if (PyUnicode_Check(key)) {
12997 /* convert string keys to integer keys */
12998 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012999 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013000 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13001 "table must be of length 1");
13002 goto err;
13003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 kind = PyUnicode_KIND(key);
13005 data = PyUnicode_DATA(key);
13006 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013007 if (!newkey)
13008 goto err;
13009 res = PyDict_SetItem(new, newkey, value);
13010 Py_DECREF(newkey);
13011 if (res < 0)
13012 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013013 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013014 /* just keep integer keys */
13015 if (PyDict_SetItem(new, key, value) < 0)
13016 goto err;
13017 } else {
13018 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13019 "be strings or integers");
13020 goto err;
13021 }
13022 }
13023 }
13024 return new;
13025 err:
13026 Py_DECREF(new);
13027 return NULL;
13028}
13029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013030PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013031 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032\n\
13033Return a copy of the string S, where all characters have been mapped\n\
13034through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013035Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013036Unmapped characters are left untouched. Characters mapped to None\n\
13037are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038
13039static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043}
13044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013045PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013046 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013048Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049
13050static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013051unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013053 if (PyUnicode_READY(self) == -1)
13054 return NULL;
13055 if (PyUnicode_IS_ASCII(self))
13056 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013057 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058}
13059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013060PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013063Pad a numeric string S with zeros on the left, to fill a field\n\
13064of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065
13066static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013067unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013068{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013069 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013070 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013071 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 int kind;
13073 void *data;
13074 Py_UCS4 chr;
13075
Martin v. Löwis18e16552006-02-15 17:27:45 +000013076 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077 return NULL;
13078
Benjamin Petersonbac79492012-01-14 13:34:47 -050013079 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081
Victor Stinnerc4b49542011-12-11 22:44:26 +010013082 if (PyUnicode_GET_LENGTH(self) >= width)
13083 return unicode_result_unchanged(self);
13084
13085 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
13087 u = pad(self, fill, 0, '0');
13088
Walter Dörwald068325e2002-04-15 13:36:47 +000013089 if (u == NULL)
13090 return NULL;
13091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 kind = PyUnicode_KIND(u);
13093 data = PyUnicode_DATA(u);
13094 chr = PyUnicode_READ(kind, data, fill);
13095
13096 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 PyUnicode_WRITE(kind, data, 0, chr);
13099 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100 }
13101
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013102 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013103 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
#if 0
/* Compiled out by the surrounding "#if 0".  Would forward self to
 * PyUnicode_TransformDecimalAndSpaceToASCII().
 */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013114PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013117Return True if S starts with the specified prefix, False otherwise.\n\
13118With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013119With optional end, stop comparing S at that position.\n\
13120prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121
13122static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013123unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013126 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013127 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013128 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013129 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013130 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
Jesus Ceaac451502011-04-20 17:09:23 +020013132 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013134 if (PyTuple_Check(subobj)) {
13135 Py_ssize_t i;
13136 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013138 if (substring == NULL)
13139 return NULL;
13140 result = tailmatch(self, substring, start, end, -1);
13141 Py_DECREF(substring);
13142 if (result) {
13143 Py_RETURN_TRUE;
13144 }
13145 }
13146 /* nothing matched */
13147 Py_RETURN_FALSE;
13148 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013149 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013150 if (substring == NULL) {
13151 if (PyErr_ExceptionMatches(PyExc_TypeError))
13152 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13153 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013155 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013156 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159}
13160
13161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013162PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013165Return True if S ends with the specified suffix, False otherwise.\n\
13166With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013167With optional end, stop comparing S at that position.\n\
13168suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169
13170static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013171unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013174 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013175 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013176 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013177 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013178 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179
Jesus Ceaac451502011-04-20 17:09:23 +020013180 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 if (PyTuple_Check(subobj)) {
13183 Py_ssize_t i;
13184 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013185 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013187 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013189 result = tailmatch(self, substring, start, end, +1);
13190 Py_DECREF(substring);
13191 if (result) {
13192 Py_RETURN_TRUE;
13193 }
13194 }
13195 Py_RETURN_FALSE;
13196 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013197 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013198 if (substring == NULL) {
13199 if (PyErr_ExceptionMatches(PyExc_TypeError))
13200 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13201 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013203 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013204 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207}
13208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013210
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
    "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
    "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013222
Eric Smith4a7d76d2008-05-30 18:10:19 +000013223static PyObject *
13224unicode__format__(PyObject* self, PyObject* args)
13225{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013226 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013227
13228 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13229 return NULL;
13230
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013231 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013233 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013234}
13235
Eric Smith8c663262007-08-25 02:26:07 +000013236PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013238\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013239Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013240
13241static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013242unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013244 Py_ssize_t size;
13245
13246 /* If it's a compact object, account for base structure +
13247 character data. */
13248 if (PyUnicode_IS_COMPACT_ASCII(v))
13249 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13250 else if (PyUnicode_IS_COMPACT(v))
13251 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013252 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 else {
13254 /* If it is a two-block object, account for base object, and
13255 for character block if present. */
13256 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013257 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013259 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 }
13261 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013262 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013263 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013265 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013266 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267
13268 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013269}
13270
13271PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013273
13274static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013275unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013276{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013277 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 if (!copy)
13279 return NULL;
13280 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013281}
13282
/* Method table for the str type.  Each entry gives the Python-visible name,
   the C implementation, the calling-convention flags and, where provided,
   the docstring.  The table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is registered with METH_STATIC, i.e. as a static method. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13339
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013340static PyObject *
13341unicode_mod(PyObject *v, PyObject *w)
13342{
Brian Curtindfc80e32011-08-10 20:28:54 -050013343 if (!PyUnicode_Check(v))
13344 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013346}
13347
/* Number protocol slots for str.  Only nb_remainder is filled in (the '%'
   operator, via unicode_mod); the initializer stops there, so every later
   member of the static struct is zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13354
/* Sequence protocol slots for str.  Slots set to 0 are not provided:
   slicing through sq_slice and both assignment slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13365
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013366static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013367unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013369 if (PyUnicode_READY(self) == -1)
13370 return NULL;
13371
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013372 if (PyIndex_Check(item)) {
13373 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013374 if (i == -1 && PyErr_Occurred())
13375 return NULL;
13376 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013377 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013378 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013379 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013380 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013381 PyObject *result;
13382 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013383 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013384 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013388 return NULL;
13389 }
13390
13391 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013392 Py_INCREF(unicode_empty);
13393 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013394 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013395 slicelength == PyUnicode_GET_LENGTH(self)) {
13396 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013397 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013398 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013399 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013400 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013401 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013402 src_kind = PyUnicode_KIND(self);
13403 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013404 if (!PyUnicode_IS_ASCII(self)) {
13405 kind_limit = kind_maxchar_limit(src_kind);
13406 max_char = 0;
13407 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13408 ch = PyUnicode_READ(src_kind, src_data, cur);
13409 if (ch > max_char) {
13410 max_char = ch;
13411 if (max_char >= kind_limit)
13412 break;
13413 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013414 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013415 }
Victor Stinner55c99112011-10-13 01:17:06 +020013416 else
13417 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013418 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013419 if (result == NULL)
13420 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013421 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013422 dest_data = PyUnicode_DATA(result);
13423
13424 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013425 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13426 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013427 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013428 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013429 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013430 } else {
13431 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13432 return NULL;
13433 }
13434}
13435
/* Mapping protocol slots for str: length and subscripting are provided;
   item assignment is left empty (0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13441
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443/* Helpers for PyUnicode_Format() */
13444
13445static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013446getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013448 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 (*p_argidx)++;
13451 if (arglen < 0)
13452 return args;
13453 else
13454 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013455 }
13456 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458 return NULL;
13459}
13460
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013461/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013462
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013463static PyObject *
13464formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013466 char *p;
13467 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013468 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013469
Guido van Rossumd57fd912000-03-10 22:53:23 +000013470 x = PyFloat_AsDouble(v);
13471 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013472 return NULL;
13473
Guido van Rossumd57fd912000-03-10 22:53:23 +000013474 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013476
Eric Smith0923d1d2009-04-16 20:16:10 +000013477 p = PyOS_double_to_string(x, type, prec,
13478 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013479 if (p == NULL)
13480 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013481 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013482 PyMem_Free(p);
13483 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013484}
13485
Victor Stinnerd0880d52012-04-27 23:40:13 +020013486/* formatlong() emulates the format codes d, u, o, x and X, and
13487 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13488 * Python's regular ints.
13489 * Return value: a new PyUnicodeObject*, or NULL if error.
13490 * The output string is of the form
13491 * "-"? ("0x" | "0X")? digit+
13492 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13493 * set in flags. The case of hex digits will be correct,
13494 * There will be at least prec digits, zero-filled on the left if
13495 * necessary to get that many.
13496 * val object to be converted
13497 * flags bitmask of format flags; only F_ALT is looked at
13498 * prec minimum number of digits; 0-fill on left if needed
13499 * type a character in [duoxX]; u acts the same as d
13500 *
13501 * CAUTION: o, x and X conversions on regular ints can never
13502 * produce a '-' sign, but can for Python's unbounded ints.
13503 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013504static PyObject*
13505formatlong(PyObject *val, int flags, int prec, int type)
13506{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013507 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013508 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013509 Py_ssize_t i;
13510 int sign; /* 1 if '-', else 0 */
13511 int len; /* number of characters */
13512 Py_ssize_t llen;
13513 int numdigits; /* len == numnondigits + numdigits */
13514 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013515
Victor Stinnerd0880d52012-04-27 23:40:13 +020013516 /* Avoid exceeding SSIZE_T_MAX */
13517 if (prec > INT_MAX-3) {
13518 PyErr_SetString(PyExc_OverflowError,
13519 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013520 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013521 }
13522
13523 assert(PyLong_Check(val));
13524
13525 switch (type) {
13526 case 'd':
13527 case 'u':
13528 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013529 if (PyBool_Check(val))
13530 result = PyNumber_ToBase(val, 10);
13531 else
13532 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013533 break;
13534 case 'o':
13535 numnondigits = 2;
13536 result = PyNumber_ToBase(val, 8);
13537 break;
13538 case 'x':
13539 case 'X':
13540 numnondigits = 2;
13541 result = PyNumber_ToBase(val, 16);
13542 break;
13543 default:
13544 assert(!"'type' not in [duoxX]");
13545 }
13546 if (!result)
13547 return NULL;
13548
13549 assert(unicode_modifiable(result));
13550 assert(PyUnicode_IS_READY(result));
13551 assert(PyUnicode_IS_ASCII(result));
13552
13553 /* To modify the string in-place, there can only be one reference. */
13554 if (Py_REFCNT(result) != 1) {
13555 PyErr_BadInternalCall();
13556 return NULL;
13557 }
13558 buf = PyUnicode_DATA(result);
13559 llen = PyUnicode_GET_LENGTH(result);
13560 if (llen > INT_MAX) {
13561 PyErr_SetString(PyExc_ValueError,
13562 "string too large in _PyBytes_FormatLong");
13563 return NULL;
13564 }
13565 len = (int)llen;
13566 sign = buf[0] == '-';
13567 numnondigits += sign;
13568 numdigits = len - numnondigits;
13569 assert(numdigits > 0);
13570
13571 /* Get rid of base marker unless F_ALT */
13572 if (((flags & F_ALT) == 0 &&
13573 (type == 'o' || type == 'x' || type == 'X'))) {
13574 assert(buf[sign] == '0');
13575 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13576 buf[sign+1] == 'o');
13577 numnondigits -= 2;
13578 buf += 2;
13579 len -= 2;
13580 if (sign)
13581 buf[0] = '-';
13582 assert(len == numnondigits + numdigits);
13583 assert(numdigits > 0);
13584 }
13585
13586 /* Fill with leading zeroes to meet minimum width. */
13587 if (prec > numdigits) {
13588 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13589 numnondigits + prec);
13590 char *b1;
13591 if (!r1) {
13592 Py_DECREF(result);
13593 return NULL;
13594 }
13595 b1 = PyBytes_AS_STRING(r1);
13596 for (i = 0; i < numnondigits; ++i)
13597 *b1++ = *buf++;
13598 for (i = 0; i < prec - numdigits; i++)
13599 *b1++ = '0';
13600 for (i = 0; i < numdigits; i++)
13601 *b1++ = *buf++;
13602 *b1 = '\0';
13603 Py_DECREF(result);
13604 result = r1;
13605 buf = PyBytes_AS_STRING(result);
13606 len = numnondigits + prec;
13607 }
13608
13609 /* Fix up case for hex conversions. */
13610 if (type == 'X') {
13611 /* Need to convert all lower case letters to upper case.
13612 and need to convert 0x to 0X (and -0x to -0X). */
13613 for (i = 0; i < len; i++)
13614 if (buf[i] >= 'a' && buf[i] <= 'x')
13615 buf[i] -= 'a'-'A';
13616 }
13617 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13618 PyObject *unicode;
13619 unicode = unicode_fromascii((unsigned char *)buf, len);
13620 Py_DECREF(result);
13621 result = unicode;
13622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013623 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013624}
13625
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013626static Py_UCS4
13627formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013628{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013629 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013630 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013632 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 goto onError;
13635 }
13636 else {
13637 /* Integer input truncated to a character */
13638 long x;
13639 x = PyLong_AsLong(v);
13640 if (x == -1 && PyErr_Occurred())
13641 goto onError;
13642
Victor Stinner8faf8212011-12-08 22:14:11 +010013643 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 PyErr_SetString(PyExc_OverflowError,
13645 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013647 }
13648
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013649 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013650 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013651
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013653 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013655 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013656}
13657
/* State of an output string that is built incrementally.  The data, kind
   and maxchar members are cached from buffer by unicode_writer_update(). */
struct unicode_writer_t {
    PyObject *buffer;           /* str object being written to */
    void *data;                 /* PyUnicode_DATA(buffer) */
    enum PyUnicode_Kind kind;   /* PyUnicode_KIND(buffer) */
    Py_UCS4 maxchar;            /* PyUnicode_MAX_CHAR_VALUE(buffer) */
    Py_ssize_t length;          /* length requested for buffer */
    Py_ssize_t pos;             /* index of the next character to write */
};
13666
13667Py_LOCAL_INLINE(void)
13668unicode_writer_update(struct unicode_writer_t *writer)
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013669{
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013670 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13671 writer->data = PyUnicode_DATA(writer->buffer);
13672 writer->kind = PyUnicode_KIND(writer->buffer);
13673}
13674
13675Py_LOCAL_INLINE(int)
13676unicode_writer_init(struct unicode_writer_t *writer,
13677 Py_ssize_t length, Py_UCS4 maxchar)
13678{
13679 writer->pos = 0;
13680 writer->length = length;
13681 writer->buffer = PyUnicode_New(writer->length, maxchar);
13682 if (writer->buffer == NULL)
13683 return -1;
13684 unicode_writer_update(writer);
13685 return 0;
13686}
13687
13688Py_LOCAL_INLINE(int)
13689unicode_writer_prepare(struct unicode_writer_t *writer,
13690 Py_ssize_t length, Py_UCS4 maxchar)
13691{
13692 Py_ssize_t newlen;
13693
13694 if (length > PY_SSIZE_T_MAX - writer->pos) {
13695 PyErr_NoMemory();
13696 return -1;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013697 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013698 newlen = writer->pos + length;
13699
13700 if (newlen > writer->length && maxchar > writer->maxchar) {
13701 PyObject *newbuffer;
13702
13703 /* overallocate 25% to limit the number of resize */
13704 if (newlen > PY_SSIZE_T_MAX - newlen / 4)
13705 writer->length = newlen;
13706 else
13707 writer->length = newlen + newlen / 4;
13708
13709 /* resize + widen */
13710 newbuffer = PyUnicode_New(writer->length, maxchar);
13711 if (newbuffer == NULL)
13712 return -1;
13713 PyUnicode_CopyCharacters(newbuffer, 0,
13714 writer->buffer, 0, writer->pos);
13715 Py_DECREF(writer->buffer);
13716 writer->buffer = newbuffer;
13717 unicode_writer_update(writer);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013718 return 0;
13719 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013720 if (newlen > writer->length) {
13721 /* overallocate 25% to limit the number of resize */
13722 if (newlen > PY_SSIZE_T_MAX - newlen / 4)
13723 writer->length = newlen;
13724 else
13725 writer->length = newlen + newlen / 4;
13726 if (PyUnicode_Resize(&writer->buffer, writer->length) < 0)
13727 return -1;
13728 unicode_writer_update(writer);
13729 }
13730 if (maxchar > writer->maxchar) {
13731 if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
13732 return -1;
13733 unicode_writer_update(writer);
13734 }
13735 return 0;
13736}
13737
13738Py_LOCAL_INLINE(int)
13739unicode_writer_write_str(
13740 struct unicode_writer_t *writer,
13741 PyObject *str, Py_ssize_t start, Py_ssize_t length)
13742{
13743 Py_UCS4 maxchar;
13744 maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
13745 if (unicode_writer_prepare(writer, length, maxchar) == -1)
13746 return -1;
13747 assert((writer->pos + length) <= writer->length);
13748 copy_characters(writer->buffer, writer->pos,
13749 str, start, length);
13750 writer->pos += length;
13751 return 0;
13752}
13753
13754Py_LOCAL_INLINE(int)
13755unicode_writer_write_char(
13756 struct unicode_writer_t *writer,
13757 Py_UCS4 ch)
13758{
13759 if (unicode_writer_prepare(writer, 1, ch) == -1)
13760 return -1;
13761 assert((writer->pos + 1) <= writer->length);
13762 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13763 writer->pos += 1;
13764 return 0;
13765}
13766
/* Drop the writer's reference to its buffer.  Py_CLEAR also resets
   writer->buffer to NULL and accepts a buffer that is already NULL. */
Py_LOCAL_INLINE(void)
unicode_writer_dealloc(struct unicode_writer_t *writer)
{
    Py_CLEAR(writer->buffer);
}
13772
Alexander Belopolsky40018472011-02-26 01:02:56 +000013773PyObject *
13774PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013775{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013776 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013779 PyObject *temp = NULL;
13780 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013781 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013782 void *fmt;
13783 enum PyUnicode_Kind kind, fmtkind;
13784 struct unicode_writer_t writer;
Tim Petersced69f82003-09-16 20:30:58 +000013785
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013787 PyErr_BadInternalCall();
13788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013790 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013791 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013793 if (PyUnicode_READY(uformat) == -1)
13794 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 fmt = PyUnicode_DATA(uformat);
13797 fmtkind = PyUnicode_KIND(uformat);
13798 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13799 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013800
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013801 if (unicode_writer_init(&writer, fmtcnt + 100, 127) < 0)
13802 goto onError;
13803
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 arglen = PyTuple_Size(args);
13806 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807 }
13808 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013809 arglen = -1;
13810 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013812 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013813 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815
13816 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013817 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013818 Py_ssize_t nonfmtpos;
13819 nonfmtpos = fmtpos++;
13820 while (fmtcnt >= 0 &&
13821 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13822 fmtpos++;
13823 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013824 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013825 if (fmtcnt < 0)
13826 fmtpos--;
13827 if (unicode_writer_write_str(&writer, uformat, nonfmtpos, fmtpos - nonfmtpos) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013828 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013829 }
13830 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 /* Got a format specifier */
13832 int flags = 0;
13833 Py_ssize_t width = -1;
13834 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013835 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013836 Py_UCS4 fill;
13837 int sign;
13838 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 int isnumok;
13840 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013841 void *pbuf = NULL;
13842 Py_ssize_t pindex, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013844 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013845 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13846 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013847 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 Py_ssize_t keylen;
13849 PyObject *key;
13850 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013851
Benjamin Peterson29060642009-01-31 22:14:21 +000013852 if (dict == NULL) {
13853 PyErr_SetString(PyExc_TypeError,
13854 "format requires a mapping");
13855 goto onError;
13856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013857 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013858 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 /* Skip over balanced parentheses */
13861 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013862 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13863 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013864 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013865 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013867 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013869 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013870 if (fmtcnt < 0 || pcount > 0) {
13871 PyErr_SetString(PyExc_ValueError,
13872 "incomplete format key");
13873 goto onError;
13874 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013875 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013876 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 if (key == NULL)
13878 goto onError;
13879 if (args_owned) {
13880 Py_DECREF(args);
13881 args_owned = 0;
13882 }
13883 args = PyObject_GetItem(dict, key);
13884 Py_DECREF(key);
13885 if (args == NULL) {
13886 goto onError;
13887 }
13888 args_owned = 1;
13889 arglen = -1;
13890 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013892 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013893 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13894 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 case '-': flags |= F_LJUST; continue;
13896 case '+': flags |= F_SIGN; continue;
13897 case ' ': flags |= F_BLANK; continue;
13898 case '#': flags |= F_ALT; continue;
13899 case '0': flags |= F_ZERO; continue;
13900 }
13901 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013903 if (c == '*') {
13904 v = getnextarg(args, arglen, &argidx);
13905 if (v == NULL)
13906 goto onError;
13907 if (!PyLong_Check(v)) {
13908 PyErr_SetString(PyExc_TypeError,
13909 "* wants int");
13910 goto onError;
13911 }
13912 width = PyLong_AsLong(v);
13913 if (width == -1 && PyErr_Occurred())
13914 goto onError;
13915 if (width < 0) {
13916 flags |= F_LJUST;
13917 width = -width;
13918 }
13919 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013920 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 }
13922 else if (c >= '0' && c <= '9') {
13923 width = c - '0';
13924 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013925 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013926 if (c < '0' || c > '9')
13927 break;
13928 if ((width*10) / 10 != width) {
13929 PyErr_SetString(PyExc_ValueError,
13930 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013931 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 }
13933 width = width*10 + (c - '0');
13934 }
13935 }
13936 if (c == '.') {
13937 prec = 0;
13938 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013939 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 if (c == '*') {
13941 v = getnextarg(args, arglen, &argidx);
13942 if (v == NULL)
13943 goto onError;
13944 if (!PyLong_Check(v)) {
13945 PyErr_SetString(PyExc_TypeError,
13946 "* wants int");
13947 goto onError;
13948 }
13949 prec = PyLong_AsLong(v);
13950 if (prec == -1 && PyErr_Occurred())
13951 goto onError;
13952 if (prec < 0)
13953 prec = 0;
13954 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013955 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013956 }
13957 else if (c >= '0' && c <= '9') {
13958 prec = c - '0';
13959 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013960 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 if (c < '0' || c > '9')
13962 break;
13963 if ((prec*10) / 10 != prec) {
13964 PyErr_SetString(PyExc_ValueError,
13965 "prec too big");
13966 goto onError;
13967 }
13968 prec = prec*10 + (c - '0');
13969 }
13970 }
13971 } /* prec */
13972 if (fmtcnt >= 0) {
13973 if (c == 'h' || c == 'l' || c == 'L') {
13974 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013975 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013976 }
13977 }
13978 if (fmtcnt < 0) {
13979 PyErr_SetString(PyExc_ValueError,
13980 "incomplete format");
13981 goto onError;
13982 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013983
13984 if (c == '%') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013985 if (unicode_writer_write_char(&writer, '%') < 0)
13986 goto onError;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013987 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013989
13990
13991 v = getnextarg(args, arglen, &argidx);
13992 if (v == NULL)
13993 goto onError;
13994
Benjamin Peterson29060642009-01-31 22:14:21 +000013995 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013996 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013997 fill = ' ';
13998 switch (c) {
13999
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 case 's':
14001 case 'r':
14002 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000014003 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000014004 temp = v;
14005 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 }
14007 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000014008 if (c == 's')
14009 temp = PyObject_Str(v);
14010 else if (c == 'r')
14011 temp = PyObject_Repr(v);
14012 else
14013 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000014014 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014015 break;
14016
14017 case 'i':
14018 case 'd':
14019 case 'u':
14020 case 'o':
14021 case 'x':
14022 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000014023 isnumok = 0;
14024 if (PyNumber_Check(v)) {
14025 PyObject *iobj=NULL;
14026
14027 if (PyLong_Check(v)) {
14028 iobj = v;
14029 Py_INCREF(iobj);
14030 }
14031 else {
14032 iobj = PyNumber_Long(v);
14033 }
14034 if (iobj!=NULL) {
14035 if (PyLong_Check(iobj)) {
14036 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020014037 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070014038 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000014039 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000014040 }
14041 else {
14042 Py_DECREF(iobj);
14043 }
14044 }
14045 }
14046 if (!isnumok) {
14047 PyErr_Format(PyExc_TypeError,
14048 "%%%c format: a number is required, "
14049 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
14050 goto onError;
14051 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014052 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014053 fill = '0';
14054 break;
14055
14056 case 'e':
14057 case 'E':
14058 case 'f':
14059 case 'F':
14060 case 'g':
14061 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000014062 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014063 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014064 fill = '0';
Victor Stinneraff3cc62012-04-30 05:19:21 +020014065 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014066 break;
14067
14068 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014069 {
14070 Py_UCS4 ch = formatchar(v);
14071 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020014073 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000014074 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014075 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014076
14077 default:
14078 PyErr_Format(PyExc_ValueError,
14079 "unsupported format character '%c' (0x%x) "
14080 "at index %zd",
14081 (31<=c && c<=126) ? (char)c : '?',
14082 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014083 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000014084 goto onError;
14085 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014086 if (temp == NULL)
14087 goto onError;
14088 assert (PyUnicode_Check(temp));
14089 if (PyUnicode_READY(temp) == -1) {
14090 Py_CLEAR(temp);
14091 goto onError;
14092 }
14093 kind = PyUnicode_KIND(temp);
14094 pbuf = PyUnicode_DATA(temp);
14095 len = PyUnicode_GET_LENGTH(temp);
14096
14097 if (c == 's' || c == 'r' || c == 'a') {
14098 if (prec >= 0 && len > prec)
14099 len = prec;
14100 }
14101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014102 /* pbuf is initialized here. */
14103 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000014104 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014105 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14106 if (ch == '-' || ch == '+') {
14107 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014108 len--;
14109 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000014110 }
14111 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014112 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000014113 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014114 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000014115 else
14116 sign = 0;
14117 }
14118 if (width < len)
14119 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014120 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014121 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014122 if (unicode_writer_write_char(&writer, signchar) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014123 goto onError;
14124 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 if (width > len)
14126 width--;
14127 }
14128 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014129 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014130 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014131 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014132 if (unicode_writer_prepare(&writer, 2, 127) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014133 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014134 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14135 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14136 writer.pos += 2;
14137 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000014138 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 width -= 2;
14140 if (width < 0)
14141 width = 0;
14142 len -= 2;
14143 }
14144 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014145 Py_ssize_t sublen;
14146 sublen = width - len;
14147 if (unicode_writer_prepare(&writer, sublen, fill) < 0)
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014148 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014149 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
14150 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014151 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014152 }
14153 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014154 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014155 if (unicode_writer_write_char(&writer, signchar) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014156 goto onError;
14157 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014158 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014159 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14160 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014161
14162 if (unicode_writer_prepare(&writer, 2, 127) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014163 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014164 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14165 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14166 writer.pos += 2;
14167
14168 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014169 }
14170 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014172 /* Copy all characters, preserving len */
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014173 if (unicode_writer_write_str(&writer, temp, pindex, len) < 0)
14174 goto onError;
14175 if (width > len) {
14176 Py_ssize_t sublen = width - len;
14177 if (unicode_writer_prepare(&writer, sublen, ' ') < 0)
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014178 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014179 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
14180 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014181 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014182 if (dict && (argidx < arglen) && c != '%') {
14183 PyErr_SetString(PyExc_TypeError,
14184 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014185 goto onError;
14186 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014187 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014188 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014189 } /* until end */
14190 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014191 PyErr_SetString(PyExc_TypeError,
14192 "not all arguments converted during string formatting");
14193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194 }
14195
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014196 if (PyUnicode_Resize(&writer.buffer, writer.pos) < 0)
14197 goto onError;
14198
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014200 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014201 }
14202 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014203 Py_XDECREF(temp);
14204 Py_XDECREF(second);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014205 return writer.buffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206
Benjamin Peterson29060642009-01-31 22:14:21 +000014207 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014208 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014209 Py_XDECREF(temp);
14210 Py_XDECREF(second);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014211 unicode_writer_dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014212 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014213 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014214 }
14215 return NULL;
14216}
14217
Jeremy Hylton938ace62002-07-17 16:30:39 +000014218static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014219unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14220
Tim Peters6d6c1a32001-08-02 04:15:00 +000014221static PyObject *
14222unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14223{
Benjamin Peterson29060642009-01-31 22:14:21 +000014224 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 static char *kwlist[] = {"object", "encoding", "errors", 0};
14226 char *encoding = NULL;
14227 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014228
Benjamin Peterson14339b62009-01-31 16:36:08 +000014229 if (type != &PyUnicode_Type)
14230 return unicode_subtype_new(type, args, kwds);
14231 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014232 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014233 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014234 if (x == NULL) {
14235 Py_INCREF(unicode_empty);
14236 return unicode_empty;
14237 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014238 if (encoding == NULL && errors == NULL)
14239 return PyObject_Str(x);
14240 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014241 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014242}
14243
Guido van Rossume023fe02001-08-30 03:12:59 +000014244static PyObject *
14245unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14246{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014247 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014248 Py_ssize_t length, char_size;
14249 int share_wstr, share_utf8;
14250 unsigned int kind;
14251 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014252
Benjamin Peterson14339b62009-01-31 16:36:08 +000014253 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014254
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014255 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014256 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014257 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014258 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014259 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014260 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014261 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014262 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014263
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014264 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014265 if (self == NULL) {
14266 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 return NULL;
14268 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014269 kind = PyUnicode_KIND(unicode);
14270 length = PyUnicode_GET_LENGTH(unicode);
14271
14272 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014273#ifdef Py_DEBUG
14274 _PyUnicode_HASH(self) = -1;
14275#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014276 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014277#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014278 _PyUnicode_STATE(self).interned = 0;
14279 _PyUnicode_STATE(self).kind = kind;
14280 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014281 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014282 _PyUnicode_STATE(self).ready = 1;
14283 _PyUnicode_WSTR(self) = NULL;
14284 _PyUnicode_UTF8_LENGTH(self) = 0;
14285 _PyUnicode_UTF8(self) = NULL;
14286 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014287 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014288
14289 share_utf8 = 0;
14290 share_wstr = 0;
14291 if (kind == PyUnicode_1BYTE_KIND) {
14292 char_size = 1;
14293 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14294 share_utf8 = 1;
14295 }
14296 else if (kind == PyUnicode_2BYTE_KIND) {
14297 char_size = 2;
14298 if (sizeof(wchar_t) == 2)
14299 share_wstr = 1;
14300 }
14301 else {
14302 assert(kind == PyUnicode_4BYTE_KIND);
14303 char_size = 4;
14304 if (sizeof(wchar_t) == 4)
14305 share_wstr = 1;
14306 }
14307
14308 /* Ensure we won't overflow the length. */
14309 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14310 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014311 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014312 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014313 data = PyObject_MALLOC((length + 1) * char_size);
14314 if (data == NULL) {
14315 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014316 goto onError;
14317 }
14318
Victor Stinnerc3c74152011-10-02 20:39:55 +020014319 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014320 if (share_utf8) {
14321 _PyUnicode_UTF8_LENGTH(self) = length;
14322 _PyUnicode_UTF8(self) = data;
14323 }
14324 if (share_wstr) {
14325 _PyUnicode_WSTR_LENGTH(self) = length;
14326 _PyUnicode_WSTR(self) = (wchar_t *)data;
14327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014328
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014329 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014330 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014331 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014332#ifdef Py_DEBUG
14333 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14334#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014335 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014336 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014337
14338onError:
14339 Py_DECREF(unicode);
14340 Py_DECREF(self);
14341 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014342}
14343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014344PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014345 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014346\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014347Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014348encoding defaults to the current default string encoding.\n\
14349errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014350
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014351static PyObject *unicode_iter(PyObject *seq);
14352
/* Type object of the built-in "str" type.  Instances are created through
   unicode_new (tp_new) and the type can be subclassed
   (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14396
14397/* Initialize the Unicode implementation */
14398
Victor Stinner3a50e702011-10-18 21:21:00 +020014399int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014400{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014401 int i;
14402
Thomas Wouters477c8d52006-05-27 19:21:47 +000014403 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014404 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014405 0x000A, /* LINE FEED */
14406 0x000D, /* CARRIAGE RETURN */
14407 0x001C, /* FILE SEPARATOR */
14408 0x001D, /* GROUP SEPARATOR */
14409 0x001E, /* RECORD SEPARATOR */
14410 0x0085, /* NEXT LINE */
14411 0x2028, /* LINE SEPARATOR */
14412 0x2029, /* PARAGRAPH SEPARATOR */
14413 };
14414
Fred Drakee4315f52000-05-09 19:53:39 +000014415 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014416 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014417 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014418 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014419 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014420
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014421 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014422 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014423 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014424 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014425
14426 /* initialize the linebreak bloom filter */
14427 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014428 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014429 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014430
14431 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014432
14433#ifdef HAVE_MBCS
14434 winver.dwOSVersionInfoSize = sizeof(winver);
14435 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14436 PyErr_SetFromWindowsErr(0);
14437 return -1;
14438 }
14439#endif
14440 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014441}
14442
14443/* Finalize the Unicode implementation */
14444
/* Free-list clearing hook.  The body performs no work and always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14450
Guido van Rossumd57fd912000-03-10 22:53:23 +000014451void
Thomas Wouters78890102000-07-22 19:25:51 +000014452_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014453{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014454 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014455
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014456 Py_XDECREF(unicode_empty);
14457 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014459 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014460 if (unicode_latin1[i]) {
14461 Py_DECREF(unicode_latin1[i]);
14462 unicode_latin1[i] = NULL;
14463 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014464 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014465 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014466 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014467}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014468
Walter Dörwald16807132007-05-25 13:52:07 +000014469void
14470PyUnicode_InternInPlace(PyObject **p)
14471{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014472 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014473 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014474#ifdef Py_DEBUG
14475 assert(s != NULL);
14476 assert(_PyUnicode_CHECK(s));
14477#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014479 return;
14480#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014481 /* If it's a subclass, we don't really know what putting
14482 it in the interned dict might do. */
14483 if (!PyUnicode_CheckExact(s))
14484 return;
14485 if (PyUnicode_CHECK_INTERNED(s))
14486 return;
14487 if (interned == NULL) {
14488 interned = PyDict_New();
14489 if (interned == NULL) {
14490 PyErr_Clear(); /* Don't leave an exception */
14491 return;
14492 }
14493 }
14494 /* It might be that the GetItem call fails even
14495 though the key is present in the dictionary,
14496 namely when this happens during a stack overflow. */
14497 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014498 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014499 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014500
Benjamin Peterson29060642009-01-31 22:14:21 +000014501 if (t) {
14502 Py_INCREF(t);
14503 Py_DECREF(*p);
14504 *p = t;
14505 return;
14506 }
Walter Dörwald16807132007-05-25 13:52:07 +000014507
Benjamin Peterson14339b62009-01-31 16:36:08 +000014508 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014509 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014510 PyErr_Clear();
14511 PyThreadState_GET()->recursion_critical = 0;
14512 return;
14513 }
14514 PyThreadState_GET()->recursion_critical = 0;
14515 /* The two references in interned are not counted by refcnt.
14516 The deallocator will take care of this */
14517 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014518 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014519}
14520
14521void
14522PyUnicode_InternImmortal(PyObject **p)
14523{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014524 PyUnicode_InternInPlace(p);
14525 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014526 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014527 Py_INCREF(*p);
14528 }
Walter Dörwald16807132007-05-25 13:52:07 +000014529}
14530
14531PyObject *
14532PyUnicode_InternFromString(const char *cp)
14533{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014534 PyObject *s = PyUnicode_FromString(cp);
14535 if (s == NULL)
14536 return NULL;
14537 PyUnicode_InternInPlace(&s);
14538 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014539}
14540
Alexander Belopolsky40018472011-02-26 01:02:56 +000014541void
14542_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014543{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014544 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014545 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014546 Py_ssize_t i, n;
14547 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014548
Benjamin Peterson14339b62009-01-31 16:36:08 +000014549 if (interned == NULL || !PyDict_Check(interned))
14550 return;
14551 keys = PyDict_Keys(interned);
14552 if (keys == NULL || !PyList_Check(keys)) {
14553 PyErr_Clear();
14554 return;
14555 }
Walter Dörwald16807132007-05-25 13:52:07 +000014556
Benjamin Peterson14339b62009-01-31 16:36:08 +000014557 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14558 detector, interned unicode strings are not forcibly deallocated;
14559 rather, we give them their stolen references back, and then clear
14560 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014561
Benjamin Peterson14339b62009-01-31 16:36:08 +000014562 n = PyList_GET_SIZE(keys);
14563 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014564 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014565 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014566 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014567 if (PyUnicode_READY(s) == -1) {
14568 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014569 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014571 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014572 case SSTATE_NOT_INTERNED:
14573 /* XXX Shouldn't happen */
14574 break;
14575 case SSTATE_INTERNED_IMMORTAL:
14576 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014577 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014578 break;
14579 case SSTATE_INTERNED_MORTAL:
14580 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014581 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014582 break;
14583 default:
14584 Py_FatalError("Inconsistent interned string state.");
14585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014586 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014587 }
14588 fprintf(stderr, "total size of all interned strings: "
14589 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14590 "mortal/immortal\n", mortal_size, immortal_size);
14591 Py_DECREF(keys);
14592 PyDict_Clear(interned);
14593 Py_DECREF(interned);
14594 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014595}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014596
14597
14598/********************* Unicode Iterator **************************/
14599
/* Instance layout of the str iterator (objects of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14605
14606static void
14607unicodeiter_dealloc(unicodeiterobject *it)
14608{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014609 _PyObject_GC_UNTRACK(it);
14610 Py_XDECREF(it->it_seq);
14611 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014612}
14613
14614static int
14615unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14616{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014617 Py_VISIT(it->it_seq);
14618 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014619}
14620
14621static PyObject *
14622unicodeiter_next(unicodeiterobject *it)
14623{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014624 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014625
Benjamin Peterson14339b62009-01-31 16:36:08 +000014626 assert(it != NULL);
14627 seq = it->it_seq;
14628 if (seq == NULL)
14629 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014630 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014632 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14633 int kind = PyUnicode_KIND(seq);
14634 void *data = PyUnicode_DATA(seq);
14635 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14636 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014637 if (item != NULL)
14638 ++it->it_index;
14639 return item;
14640 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014641
Benjamin Peterson14339b62009-01-31 16:36:08 +000014642 Py_DECREF(seq);
14643 it->it_seq = NULL;
14644 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014645}
14646
14647static PyObject *
14648unicodeiter_len(unicodeiterobject *it)
14649{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014650 Py_ssize_t len = 0;
14651 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014652 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014653 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014654}
14655
14656PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14657
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014658static PyObject *
14659unicodeiter_reduce(unicodeiterobject *it)
14660{
14661 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014662 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014663 it->it_seq, it->it_index);
14664 } else {
14665 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14666 if (u == NULL)
14667 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014668 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014669 }
14670}
14671
14672PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14673
14674static PyObject *
14675unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14676{
14677 Py_ssize_t index = PyLong_AsSsize_t(state);
14678 if (index == -1 && PyErr_Occurred())
14679 return NULL;
14680 if (index < 0)
14681 index = 0;
14682 it->it_index = index;
14683 Py_RETURN_NONE;
14684}
14685
14686PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14687
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014688static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014689 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014690 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014691 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14692 reduce_doc},
14693 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14694 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014695 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014696};
14697
/* Type object of the str iterator ("str_iterator").  Instances are
   unicodeiterobject structures; the type is GC-aware and is its own
   iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14730
14731static PyObject *
14732unicode_iter(PyObject *seq)
14733{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014734 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014735
Benjamin Peterson14339b62009-01-31 16:36:08 +000014736 if (!PyUnicode_Check(seq)) {
14737 PyErr_BadInternalCall();
14738 return NULL;
14739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014740 if (PyUnicode_READY(seq) == -1)
14741 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014742 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14743 if (it == NULL)
14744 return NULL;
14745 it->it_index = 0;
14746 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014747 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 _PyObject_GC_TRACK(it);
14749 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014750}
14751
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014752
14753size_t
14754Py_UNICODE_strlen(const Py_UNICODE *u)
14755{
14756 int res = 0;
14757 while(*u++)
14758 res++;
14759 return res;
14760}
14761
14762Py_UNICODE*
14763Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14764{
14765 Py_UNICODE *u = s1;
14766 while ((*u++ = *s2++));
14767 return s1;
14768}
14769
14770Py_UNICODE*
14771Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14772{
14773 Py_UNICODE *u = s1;
14774 while ((*u++ = *s2++))
14775 if (n-- == 0)
14776 break;
14777 return s1;
14778}
14779
14780Py_UNICODE*
14781Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14782{
14783 Py_UNICODE *u1 = s1;
14784 u1 += Py_UNICODE_strlen(u1);
14785 Py_UNICODE_strcpy(u1, s2);
14786 return s1;
14787}
14788
14789int
14790Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14791{
14792 while (*s1 && *s2 && *s1 == *s2)
14793 s1++, s2++;
14794 if (*s1 && *s2)
14795 return (*s1 < *s2) ? -1 : +1;
14796 if (*s1)
14797 return 1;
14798 if (*s2)
14799 return -1;
14800 return 0;
14801}
14802
14803int
14804Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14805{
14806 register Py_UNICODE u1, u2;
14807 for (; n != 0; n--) {
14808 u1 = *s1;
14809 u2 = *s2;
14810 if (u1 != u2)
14811 return (u1 < u2) ? -1 : +1;
14812 if (u1 == '\0')
14813 return 0;
14814 s1++;
14815 s2++;
14816 }
14817 return 0;
14818}
14819
14820Py_UNICODE*
14821Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14822{
14823 const Py_UNICODE *p;
14824 for (p = s; *p; p++)
14825 if (*p == c)
14826 return (Py_UNICODE*)p;
14827 return NULL;
14828}
14829
14830Py_UNICODE*
14831Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14832{
14833 const Py_UNICODE *p;
14834 p = s + Py_UNICODE_strlen(s);
14835 while (p != s) {
14836 p--;
14837 if (*p == c)
14838 return (Py_UNICODE*)p;
14839 }
14840 return NULL;
14841}
Victor Stinner331ea922010-08-10 16:37:20 +000014842
Victor Stinner71133ff2010-09-01 23:43:53 +000014843Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014844PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014845{
Victor Stinner577db2c2011-10-11 22:12:48 +020014846 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014847 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014849 if (!PyUnicode_Check(unicode)) {
14850 PyErr_BadArgument();
14851 return NULL;
14852 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014853 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014854 if (u == NULL)
14855 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014856 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014857 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014858 PyErr_NoMemory();
14859 return NULL;
14860 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014861 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014862 size *= sizeof(Py_UNICODE);
14863 copy = PyMem_Malloc(size);
14864 if (copy == NULL) {
14865 PyErr_NoMemory();
14866 return NULL;
14867 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014868 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014869 return copy;
14870}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014871
Georg Brandl66c221e2010-10-14 07:04:07 +000014872/* A _string module, to export formatter_parser and formatter_field_name_split
14873 to the string.Formatter class implemented in Python. */
14874
14875static PyMethodDef _string_methods[] = {
14876 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14877 METH_O, PyDoc_STR("split the argument as a field name")},
14878 {"formatter_parser", (PyCFunction) formatter_parser,
14879 METH_O, PyDoc_STR("parse the argument as a format string")},
14880 {NULL, NULL}
14881};
14882
/* Module definition for "_string", exposing the functions listed in
   _string_methods. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
14894
14895PyMODINIT_FUNC
14896PyInit__string(void)
14897{
14898 return PyModule_Create(&_string_module);
14899}
14900
14901
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014902#ifdef __cplusplus
14903}
14904#endif