blob: e22fcfd02b126b5ea4254dc179aa9445a01c5f0c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection.  Little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point of Unicode 6.0: 0x10ffff (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used inside the assertions of the macros below.  Debug builds
   go through _PyUnicode_CheckConsistency() (without the content scan);
   other builds only perform PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* Raw, unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked access to the UTF-8 bytes of a ready string: for a compact ASCII
   string this is the memory directly following the PyASCIIObject header,
   otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw, unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length of a ready string: the character length for a
   compact ASCII string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw, unchecked field accessors (usable as lvalues). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Field reads guarded by an assertion on the object type. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw access to the data pointer stored in a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of the two values, so it is never smaller
   than either of them but is not always the exact numeric maximum.
   NOTE(review): presumably sufficient because only the range the value
   falls into matters to PyUnicode_New() -- confirm against its contract. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar2) | (maxchar1))
118
/* File-local replacement for PyUnicode_READY(): asserts on the object type,
   evaluates to 0 when the string is already ready and otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 pointer of the (non compact-ASCII) object aliases its
   character data instead of pointing to a separate buffer. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object aliases its character data.
   Only the macro argument is referenced: no identifier from the caller's
   scope is captured, so the macro works whatever the variable holding the
   object is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* Non-zero when the object owns a separate UTF-8 buffer: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer does not
   alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && !(_PyUnicode_UTF8(op) == PyUnicode_DATA(op))))

/* Non-zero when the object owns a separate wstr buffer: its wstr pointer
   is set and either the string is not ready yet or the pointer does not
   alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The copy is
   unrolled four characters at a time; the remaining 0..3 characters are
   handled by the trailing loop.

   Every pointer argument is parenthesized before use, so an expression
   such as "buf + offset" may be passed for `to` without the cast binding
   to `buf` alone.  All temporaries carry a leading underscore to keep
   them apart from identifiers of the calling code. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by code point; an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for every other index. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
/* Same for linebreaks: 128-entry lookup table indexed by code point.
   An entry is 1 for the line break characters listed next to the rows
   below and 0 for every other index.  The table is only ever read, so it
   is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   op must satisfy PyUnicode_Check().  The function verifies, through
   assertions only, that the state flags, the kind and the data/utf8/wstr
   pointers and lengths agree with each other for the three layouts
   handled below (compact ASCII, compact non-ASCII, non-compact).

   If check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to verify that the narrowest
   possible kind is used and that the data is null-terminated.

   Always returns 1, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.compact == 1 && hdr->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *ext = (PyCompactUnicodeObject *)op;
        void *chars;
        int wstr_has_data_width;

        if (hdr->state.compact != 1) {
            /* non-compact string: the data pointer is stored in the object */
            chars = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(chars != NULL);
                if (!hdr->state.ascii) {
                    assert (ext->utf8 != chars);
                }
                else {
                    assert (ext->utf8 == chars);
                    assert (ext->utf8_length == hdr->length);
                }
            }
            else {
                /* not ready yet: only the wstr buffer is populated */
                assert(hdr->length == 0);
                assert(hdr->hash == -1);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
                assert(hdr->wstr != NULL);
                assert(chars == NULL);
                assert(ext->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: the data follows the struct */
            chars = ext + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert (ext->utf8 != chars);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the width
               of wchar_t */
#if SIZEOF_WCHAR_T == 2
            wstr_has_data_width = (kind == PyUnicode_2BYTE_KIND);
#else
            wstr_has_data_width = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (!wstr_has_data_width) {
                assert(hdr->wstr != chars);
            }
            else {
                assert(hdr->wstr == chars);
                assert(ext->wstr_length == hdr->length);
            }
        }

        /* an absent buffer must have a zero length */
        if (ext->utf8 == NULL) {
            assert(ext->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(ext->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 highest = 0;
        Py_UCS4 cur;
        Py_ssize_t pos;
        void *chars;

        chars = PyUnicode_DATA(hdr);
        pos = 0;
        while (pos < hdr->length) {
            cur = PyUnicode_READ(kind, chars, pos);
            if (highest < cur)
                highest = cur;
            pos++;
        }

        if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else if (kind != PyUnicode_1BYTE_KIND) {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        else if (hdr->state.ascii != 0) {
            assert(highest < 128);
        }
        else {
            assert(highest >= 128);
            assert(highest <= 255);
        }

        /* the character after the last one must be the null terminator */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
Victor Stinner3a50e702011-10-18 21:21:00 +0200513#ifdef HAVE_MBCS
514static OSVERSIONINFOEX winver;
515#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
522
523/* the linebreak mask is set up by Unicode_Init below */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters (see BLOOM_LINEBREAK). */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in
   mask, which must be a modifiable lvalue.
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask; a
   zero result means ch was never added, a non-zero result only means it
   may have been.
   Both arguments are parenthesized, so arbitrary expressions (such as
   "a | b" for the mask) can be passed safely. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if ch is a line break: table lookup below 128, otherwise the
   bloom mask is used as a cheap pre-filter before the exact
   Py_UNICODE_ISLINEBREAK() test.  ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
/* Non-zero if chr occurs in str: the bloom mask of str is consulted first
   and PyUnicode_FindChar() (forward search over the whole string) is only
   called when the mask does not already rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (0 <= PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100658 assert(PyUnicode_IS_COMPACT(unicode));
659
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200660 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100661 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 struct_size = sizeof(PyASCIIObject);
663 else
664 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 new_size = (struct_size + (length + 1) * char_size);
672
Victor Stinner84def372011-12-11 20:04:56 +0100673 _Py_DEC_REFTOTAL;
674 _Py_ForgetReference(unicode);
675
676 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
677 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100678 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200679 PyErr_NoMemory();
680 return NULL;
681 }
Victor Stinner84def372011-12-11 20:04:56 +0100682 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200683 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200686 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100688 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200691 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
692 length, 0);
693 return unicode;
694}
695
Alexander Belopolsky40018472011-02-26 01:02:56 +0000696static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200697resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698{
Victor Stinner95663112011-10-04 01:03:50 +0200699 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100700 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000703
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 if (PyUnicode_IS_READY(unicode)) {
705 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200706 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 void *data;
708
709 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200710 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200711 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
712 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713
714 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
715 PyErr_NoMemory();
716 return -1;
717 }
718 new_size = (length + 1) * char_size;
719
Victor Stinner7a9105a2011-12-12 00:13:42 +0100720 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
721 {
722 PyObject_DEL(_PyUnicode_UTF8(unicode));
723 _PyUnicode_UTF8(unicode) = NULL;
724 _PyUnicode_UTF8_LENGTH(unicode) = 0;
725 }
726
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727 data = (PyObject *)PyObject_REALLOC(data, new_size);
728 if (data == NULL) {
729 PyErr_NoMemory();
730 return -1;
731 }
732 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200733 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_WSTR_LENGTH(unicode) = length;
736 }
737 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200738 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200739 _PyUnicode_UTF8_LENGTH(unicode) = length;
740 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_LENGTH(unicode) = length;
742 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200743 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200744 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200745 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 }
Victor Stinner95663112011-10-04 01:03:50 +0200748 assert(_PyUnicode_WSTR(unicode) != NULL);
749
750 /* check for integer overflow */
751 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
752 PyErr_NoMemory();
753 return -1;
754 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100755 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200756 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200758 if (!wstr) {
759 PyErr_NoMemory();
760 return -1;
761 }
762 _PyUnicode_WSTR(unicode) = wstr;
763 _PyUnicode_WSTR(unicode)[length] = 0;
764 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200765 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000766 return 0;
767}
768
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769static PyObject*
770resize_copy(PyObject *unicode, Py_ssize_t length)
771{
772 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775
Benjamin Petersonbac79492012-01-14 13:34:47 -0500776 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
780 if (copy == NULL)
781 return NULL;
782
783 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200784 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200785 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200786 }
787 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100789
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200791 if (w == NULL)
792 return NULL;
793 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
794 copy_length = Py_MIN(copy_length, length);
795 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
796 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200797 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798 }
799}
800
/* We allocate one more character than requested to make sure the string
   is U+0000-terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200811static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812#endif
813
Alexander Belopolsky40018472011-02-26 01:02:56 +0000814static PyUnicodeObject *
815_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816{
817 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819
Thomas Wouters477c8d52006-05-27 19:21:47 +0000820 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 if (length == 0 && unicode_empty != NULL) {
822 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200823 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000826 /* Ensure we won't overflow the size. */
827 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
828 return (PyUnicodeObject *)PyErr_NoMemory();
829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 if (length < 0) {
831 PyErr_SetString(PyExc_SystemError,
832 "Negative size passed to _PyUnicode_New");
833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 }
835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200836#ifdef Py_DEBUG
837 ++unicode_old_new_calls;
838#endif
839
840 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
841 if (unicode == NULL)
842 return NULL;
843 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
844 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
845 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100846 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000847 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850
Jeremy Hyltond8082792003-09-16 19:41:39 +0000851 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000852 * the caller fails before initializing str -- unicode_resize()
853 * reads str[0], and the Keep-Alive optimization can keep memory
854 * allocated for str alive across a call to unicode_dealloc(unicode).
855 * We don't want unicode_resize to read uninitialized memory in
856 * that case.
857 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_WSTR(unicode)[0] = 0;
859 _PyUnicode_WSTR(unicode)[length] = 0;
860 _PyUnicode_WSTR_LENGTH(unicode) = length;
861 _PyUnicode_HASH(unicode) = -1;
862 _PyUnicode_STATE(unicode).interned = 0;
863 _PyUnicode_STATE(unicode).kind = 0;
864 _PyUnicode_STATE(unicode).compact = 0;
865 _PyUnicode_STATE(unicode).ready = 0;
866 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200867 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200869 _PyUnicode_UTF8(unicode) = NULL;
870 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100871 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872 return unicode;
873}
874
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875static const char*
876unicode_kind_name(PyObject *unicode)
877{
Victor Stinner42dfd712011-10-03 14:41:45 +0200878 /* don't check consistency: unicode_kind_name() is called from
879 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 if (!PyUnicode_IS_COMPACT(unicode))
881 {
882 if (!PyUnicode_IS_READY(unicode))
883 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600884 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 {
886 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200887 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200888 return "legacy ascii";
889 else
890 return "legacy latin1";
891 case PyUnicode_2BYTE_KIND:
892 return "legacy UCS2";
893 case PyUnicode_4BYTE_KIND:
894 return "legacy UCS4";
895 default:
896 return "<legacy invalid kind>";
897 }
898 }
899 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600900 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 return "ascii";
904 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 default:
911 return "<invalid compact kind>";
912 }
913}
914
#ifdef Py_DEBUG
/* Debug-only counter: number of PyUnicode_New() calls that did not take the
   empty-string shortcut. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses used to locate the character data of `unicode` on
   stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    /* %p requires a void * argument */
    printf("compact data %p\n", (void*)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the internal layout of `op` on stdout:
   kind name, length, the wstr pointer (flagged "shared" when it equals the
   data pointer), and - unless the string is compact ASCII - the wstr length
   and the utf8 pointer and length, followed by the data pointer.

   Pointers are cast to void * as %p requires, and length fields to size_t
   to match %zu (they are assigned from Py_ssize_t values elsewhere in this
   file). */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Compact strings store their characters right after the struct;
       other strings use a separate buffer. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zu, ",unicode_kind_name(op), (size_t)ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The wstr_length/utf8 fields are skipped for compact ASCII strings. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zu), ", (size_t)compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", (void *)compact->utf8,
               (size_t)compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
968
969PyObject *
970PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
971{
972 PyObject *obj;
973 PyCompactUnicodeObject *unicode;
974 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200975 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200976 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 Py_ssize_t char_size;
978 Py_ssize_t struct_size;
979
980 /* Optimization for empty strings */
981 if (size == 0 && unicode_empty != NULL) {
982 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200983 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 }
985
986#ifdef Py_DEBUG
987 ++unicode_new_new_calls;
988#endif
989
Victor Stinner9e9d6892011-10-04 01:02:02 +0200990 is_ascii = 0;
991 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992 struct_size = sizeof(PyCompactUnicodeObject);
993 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200994 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 char_size = 1;
996 is_ascii = 1;
997 struct_size = sizeof(PyASCIIObject);
998 }
999 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 }
1003 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001004 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 char_size = 2;
1006 if (sizeof(wchar_t) == 2)
1007 is_sharing = 1;
1008 }
1009 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001010 if (maxchar > MAX_UNICODE) {
1011 PyErr_SetString(PyExc_SystemError,
1012 "invalid maximum character passed to PyUnicode_New");
1013 return NULL;
1014 }
Victor Stinner8f825062012-04-27 13:55:39 +02001015 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 char_size = 4;
1017 if (sizeof(wchar_t) == 4)
1018 is_sharing = 1;
1019 }
1020
1021 /* Ensure we won't overflow the size. */
1022 if (size < 0) {
1023 PyErr_SetString(PyExc_SystemError,
1024 "Negative size passed to PyUnicode_New");
1025 return NULL;
1026 }
1027 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1028 return PyErr_NoMemory();
1029
1030 /* Duplicated allocation code from _PyObject_New() instead of a call to
1031 * PyObject_New() so we are able to allocate space for the object and
1032 * it's data buffer.
1033 */
1034 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1035 if (obj == NULL)
1036 return PyErr_NoMemory();
1037 obj = PyObject_INIT(obj, &PyUnicode_Type);
1038 if (obj == NULL)
1039 return NULL;
1040
1041 unicode = (PyCompactUnicodeObject *)obj;
1042 if (is_ascii)
1043 data = ((PyASCIIObject*)obj) + 1;
1044 else
1045 data = unicode + 1;
1046 _PyUnicode_LENGTH(unicode) = size;
1047 _PyUnicode_HASH(unicode) = -1;
1048 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 _PyUnicode_STATE(unicode).compact = 1;
1051 _PyUnicode_STATE(unicode).ready = 1;
1052 _PyUnicode_STATE(unicode).ascii = is_ascii;
1053 if (is_ascii) {
1054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 }
Victor Stinner8f825062012-04-27 13:55:39 +02001057 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 ((char*)data)[size] = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 else {
1065 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001066 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001067 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS4*)data)[size] = 0;
1071 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 }
1075 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 }
1079 }
Victor Stinner8f825062012-04-27 13:55:39 +02001080#ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier.
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 return obj;
1089}
1090
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) into the UCS4 data
   of `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit, including a lone
   surrogate, is copied unchanged.  The other conversions are implemented as
   macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   code points produced (asserted); the function assumes it can also hold a
   terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1130
Victor Stinnercd9950f2011-10-02 00:34:53 +02001131static int
Victor Stinner488fa492011-12-12 00:01:39 +01001132unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133{
Victor Stinner488fa492011-12-12 00:01:39 +01001134 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001135 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001136 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001137 return -1;
1138 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return 0;
1140}
1141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142static int
1143_copy_characters(PyObject *to, Py_ssize_t to_start,
1144 PyObject *from, Py_ssize_t from_start,
1145 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 unsigned int from_kind, to_kind;
1148 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001149 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_Check(from));
1152 assert(PyUnicode_Check(to));
1153 assert(PyUnicode_IS_READY(from));
1154 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1157 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1158 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001160 if (how_many == 0)
1161 return 0;
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001168#ifdef Py_DEBUG
1169 if (!check_maxchar
1170 && (from_kind > to_kind
1171 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001173 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1174 Py_UCS4 ch;
1175 Py_ssize_t i;
1176 for (i=0; i < how_many; i++) {
1177 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1178 assert(ch <= to_maxchar);
1179 }
1180 }
1181#endif
1182 fast = (from_kind == to_kind);
1183 if (check_maxchar
1184 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1185 {
1186 /* deny latin1 => ascii */
1187 fast = 0;
1188 }
1189
1190 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001191 Py_MEMCPY((char*)to_data + to_kind * to_start,
1192 (char*)from_data + from_kind * from_start,
1193 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 else if (from_kind == PyUnicode_1BYTE_KIND
1196 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001197 {
1198 _PyUnicode_CONVERT_BYTES(
1199 Py_UCS1, Py_UCS2,
1200 PyUnicode_1BYTE_DATA(from) + from_start,
1201 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1202 PyUnicode_2BYTE_DATA(to) + to_start
1203 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001205 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 && to_kind == PyUnicode_4BYTE_KIND)
1207 {
1208 _PyUnicode_CONVERT_BYTES(
1209 Py_UCS1, Py_UCS4,
1210 PyUnicode_1BYTE_DATA(from) + from_start,
1211 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1212 PyUnicode_4BYTE_DATA(to) + to_start
1213 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001214 }
1215 else if (from_kind == PyUnicode_2BYTE_KIND
1216 && to_kind == PyUnicode_4BYTE_KIND)
1217 {
1218 _PyUnicode_CONVERT_BYTES(
1219 Py_UCS2, Py_UCS4,
1220 PyUnicode_2BYTE_DATA(from) + from_start,
1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222 PyUnicode_4BYTE_DATA(to) + to_start
1223 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001224 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001226 /* check if max_char(from substring) <= max_char(to) */
1227 if (from_kind > to_kind
1228 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001229 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001230 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 /* slow path to check for character overflow */
1232 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 Py_ssize_t i;
1235
Victor Stinner56c161a2011-10-06 02:47:11 +02001236#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 for (i=0; i < how_many; i++) {
1238 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001240 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1241 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001242#else
1243 if (!check_maxchar) {
1244 for (i=0; i < how_many; i++) {
1245 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1246 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1247 }
1248 }
1249 else {
1250 for (i=0; i < how_many; i++) {
1251 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1252 if (ch > to_maxchar)
1253 return 1;
1254 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1255 }
1256 }
1257#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001258 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001259 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001260 assert(0 && "inconsistent state");
1261 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001262 }
1263 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001264 return 0;
1265}
1266
1267static void
1268copy_characters(PyObject *to, Py_ssize_t to_start,
1269 PyObject *from, Py_ssize_t from_start,
1270 Py_ssize_t how_many)
1271{
1272 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1273}
1274
1275Py_ssize_t
1276PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1277 PyObject *from, Py_ssize_t from_start,
1278 Py_ssize_t how_many)
1279{
1280 int err;
1281
1282 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1283 PyErr_BadInternalCall();
1284 return -1;
1285 }
1286
Benjamin Petersonbac79492012-01-14 13:34:47 -05001287 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001289 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290 return -1;
1291
1292 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1293 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1294 PyErr_Format(PyExc_SystemError,
1295 "Cannot write %zi characters at %zi "
1296 "in a string of %zi characters",
1297 how_many, to_start, PyUnicode_GET_LENGTH(to));
1298 return -1;
1299 }
1300
1301 if (how_many == 0)
1302 return 0;
1303
Victor Stinner488fa492011-12-12 00:01:39 +01001304 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001305 return -1;
1306
1307 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1308 if (err) {
1309 PyErr_Format(PyExc_SystemError,
1310 "Cannot copy %s characters "
1311 "into a string of %s characters",
1312 unicode_kind_name(from),
1313 unicode_kind_name(to));
1314 return -1;
1315 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001316 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317}
1318
Victor Stinner17222162011-09-28 22:15:37 +02001319/* Find the maximum code point and count the number of surrogate pairs so a
1320 correct string length can be computed before converting a string to UCS4.
1321 This function counts single surrogates as a character and not as a pair.
1322
1323 Return 0 on success, or -1 on error. */
1324static int
1325find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1326 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327{
1328 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001329 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330
Victor Stinnerc53be962011-10-02 21:33:54 +02001331 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 *num_surrogates = 0;
1333 *maxchar = 0;
1334
1335 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001337 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1338 && (iter+1) < end
1339 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001341 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 iter += 2;
1344 }
1345 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001347 {
1348 ch = *iter;
1349 iter++;
1350 }
1351 if (ch > *maxchar) {
1352 *maxchar = ch;
1353 if (*maxchar > MAX_UNICODE) {
1354 PyErr_Format(PyExc_ValueError,
1355 "character U+%x is not in range [U+0000; U+10ffff]",
1356 ch);
1357 return -1;
1358 }
1359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 }
1361 return 0;
1362}
1363
#ifdef Py_DEBUG
/* Debug-only counter, incremented on every call to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
#endif
1367
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001368int
1369_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370{
1371 wchar_t *end;
1372 Py_UCS4 maxchar = 0;
1373 Py_ssize_t num_surrogates;
1374#if SIZEOF_WCHAR_T == 2
1375 Py_ssize_t length_wo_surrogates;
1376#endif
1377
Georg Brandl7597add2011-10-05 16:36:47 +02001378 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001379 strings were created using _PyObject_New() and where no canonical
1380 representation (the str field) has been set yet aka strings
1381 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001382 assert(_PyUnicode_CHECK(unicode));
1383 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001385 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001386 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001387 /* Actually, it should neither be interned nor be anything else: */
1388 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389
1390#ifdef Py_DEBUG
1391 ++unicode_ready_calls;
1392#endif
1393
1394 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001395 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001396 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
1399 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001400 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1401 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 PyErr_NoMemory();
1403 return -1;
1404 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001405 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 _PyUnicode_WSTR(unicode), end,
1407 PyUnicode_1BYTE_DATA(unicode));
1408 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1411 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001412 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001413 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001414 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
1416 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001417 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001418 _PyUnicode_UTF8(unicode) = NULL;
1419 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
1421 PyObject_FREE(_PyUnicode_WSTR(unicode));
1422 _PyUnicode_WSTR(unicode) = NULL;
1423 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1424 }
1425 /* In this case we might have to convert down from 4-byte native
1426 wchar_t to 2-byte unicode. */
1427 else if (maxchar < 65536) {
1428 assert(num_surrogates == 0 &&
1429 "FindMaxCharAndNumSurrogatePairs() messed up");
1430
Victor Stinner506f5922011-09-28 22:34:18 +02001431#if SIZEOF_WCHAR_T == 2
1432 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001433 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439#else
1440 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001441 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001442 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001443 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001444 PyErr_NoMemory();
1445 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 }
Victor Stinner506f5922011-09-28 22:34:18 +02001447 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1448 _PyUnicode_WSTR(unicode), end,
1449 PyUnicode_2BYTE_DATA(unicode));
1450 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1451 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001455 PyObject_FREE(_PyUnicode_WSTR(unicode));
1456 _PyUnicode_WSTR(unicode) = NULL;
1457 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1458#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 }
1460 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1461 else {
1462#if SIZEOF_WCHAR_T == 2
1463 /* in case the native representation is 2-bytes, we need to allocate a
1464 new normalized 4-byte version. */
1465 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1467 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyErr_NoMemory();
1469 return -1;
1470 }
1471 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001473 _PyUnicode_UTF8(unicode) = NULL;
1474 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001475 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001477 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 PyObject_FREE(_PyUnicode_WSTR(unicode));
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1481#else
1482 assert(num_surrogates == 0);
1483
Victor Stinnerc3c74152011-10-02 20:39:55 +02001484 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001486 _PyUnicode_UTF8(unicode) = NULL;
1487 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1489#endif
1490 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1491 }
1492 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001493 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 return 0;
1495}
1496
Alexander Belopolsky40018472011-02-26 01:02:56 +00001497static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001498unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499{
Walter Dörwald16807132007-05-25 13:52:07 +00001500 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001501 case SSTATE_NOT_INTERNED:
1502 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001503
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 case SSTATE_INTERNED_MORTAL:
1505 /* revive dead object temporarily for DelItem */
1506 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001507 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001508 Py_FatalError(
1509 "deletion of interned string failed");
1510 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 case SSTATE_INTERNED_IMMORTAL:
1513 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001514
Benjamin Peterson29060642009-01-31 22:14:21 +00001515 default:
1516 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001517 }
1518
Victor Stinner03490912011-10-03 23:45:12 +02001519 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001521 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001522 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001523 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1524 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001526 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527}
1528
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string or a cached one-character Latin-1 string), else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string in canonical (non-wchar) form can be an
       entry of the Latin-1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546static int
Victor Stinner488fa492011-12-12 00:01:39 +01001547unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548{
Victor Stinner488fa492011-12-12 00:01:39 +01001549 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 if (Py_REFCNT(unicode) != 1)
1551 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001552 if (_PyUnicode_HASH(unicode) != -1)
1553 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554 if (PyUnicode_CHECK_INTERNED(unicode))
1555 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001556 if (!PyUnicode_CheckExact(unicode))
1557 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001558#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 /* singleton refcount is greater than 1 */
1560 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001561#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 return 1;
1563}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001564
Victor Stinnerfe226c02011-10-03 03:52:20 +02001565static int
1566unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1567{
1568 PyObject *unicode;
1569 Py_ssize_t old_length;
1570
1571 assert(p_unicode != NULL);
1572 unicode = *p_unicode;
1573
1574 assert(unicode != NULL);
1575 assert(PyUnicode_Check(unicode));
1576 assert(0 <= length);
1577
Victor Stinner910337b2011-10-03 03:20:16 +02001578 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 old_length = PyUnicode_WSTR_LENGTH(unicode);
1580 else
1581 old_length = PyUnicode_GET_LENGTH(unicode);
1582 if (old_length == length)
1583 return 0;
1584
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001585 if (length == 0) {
1586 Py_DECREF(*p_unicode);
1587 *p_unicode = unicode_empty;
1588 Py_INCREF(*p_unicode);
1589 return 0;
1590 }
1591
Victor Stinner488fa492011-12-12 00:01:39 +01001592 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001593 PyObject *copy = resize_copy(unicode, length);
1594 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 Py_DECREF(*p_unicode);
1597 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001599 }
1600
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001602 PyObject *new_unicode = resize_compact(unicode, length);
1603 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001604 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001605 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001606 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001608 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001609 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001610}
1611
Alexander Belopolsky40018472011-02-26 01:02:56 +00001612int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001614{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 PyObject *unicode;
1616 if (p_unicode == NULL) {
1617 PyErr_BadInternalCall();
1618 return -1;
1619 }
1620 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001621 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 {
1623 PyErr_BadInternalCall();
1624 return -1;
1625 }
1626 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001627}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001629static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001630unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001631{
1632 PyObject *result;
1633 assert(PyUnicode_IS_READY(*p_unicode));
1634 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1635 return 0;
1636 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1637 maxchar);
1638 if (result == NULL)
1639 return -1;
1640 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1641 PyUnicode_GET_LENGTH(*p_unicode));
1642 Py_DECREF(*p_unicode);
1643 *p_unicode = result;
1644 return 0;
1645}
1646
1647static int
1648unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1649 Py_UCS4 ch)
1650{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001651 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652 if (unicode_widen(p_unicode, ch) < 0)
1653 return -1;
1654 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1655 PyUnicode_DATA(*p_unicode),
1656 (*pos)++, ch);
1657 return 0;
1658}
1659
Victor Stinnerc5166102012-02-22 13:55:02 +01001660/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1661 Return the length of the input string.
1662
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001663 WARNING: The function doesn't copy the terminating null character and
1664 doesn't check the maximum character (may write a latin1 character in an
1665 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001666static Py_ssize_t
1667unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1668{
1669 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1670 void *data = PyUnicode_DATA(unicode);
1671
1672 switch (kind) {
1673 case PyUnicode_1BYTE_KIND: {
1674 Py_ssize_t len = strlen(str);
1675 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001676 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001677 return len;
1678 }
1679 case PyUnicode_2BYTE_KIND: {
1680 Py_UCS2 *start = (Py_UCS2 *)data + index;
1681 Py_UCS2 *ucs2 = start;
1682 assert(index <= PyUnicode_GET_LENGTH(unicode));
1683
1684 for (; *str; ++ucs2, ++str)
1685 *ucs2 = (Py_UCS2)*str;
1686
1687 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1688 return ucs2 - start;
1689 }
1690 default: {
1691 Py_UCS4 *start = (Py_UCS4 *)data + index;
1692 Py_UCS4 *ucs4 = start;
1693 assert(kind == PyUnicode_4BYTE_KIND);
1694 assert(index <= PyUnicode_GET_LENGTH(unicode));
1695
1696 for (; *str; ++ucs4, ++str)
1697 *ucs4 = (Py_UCS4)*str;
1698
1699 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1700 return ucs4 - start;
1701 }
1702 }
1703}
1704
1705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706static PyObject*
1707get_latin1_char(unsigned char ch)
1708{
Victor Stinnera464fc12011-10-02 20:39:30 +02001709 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001711 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (!unicode)
1713 return NULL;
1714 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001715 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 unicode_latin1[ch] = unicode;
1717 }
1718 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001719 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720}
1721
Alexander Belopolsky40018472011-02-26 01:02:56 +00001722PyObject *
1723PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001725 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 Py_UCS4 maxchar = 0;
1727 Py_ssize_t num_surrogates;
1728
1729 if (u == NULL)
1730 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001732 /* If the Unicode data is known at construction time, we can apply
1733 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Optimization for empty strings */
1736 if (size == 0 && unicode_empty != NULL) {
1737 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001738 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001739 }
Tim Petersced69f82003-09-16 20:30:58 +00001740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 /* Single character Unicode objects in the Latin-1 range are
1742 shared when using this constructor */
1743 if (size == 1 && *u < 256)
1744 return get_latin1_char((unsigned char)*u);
1745
1746 /* If not empty and not single character, copy the Unicode data
1747 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001748 if (find_maxchar_surrogates(u, u + size,
1749 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 return NULL;
1751
Victor Stinner8faf8212011-12-08 22:14:11 +01001752 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 if (!unicode)
1754 return NULL;
1755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 switch (PyUnicode_KIND(unicode)) {
1757 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001758 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1760 break;
1761 case PyUnicode_2BYTE_KIND:
1762#if Py_UNICODE_SIZE == 2
1763 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1764#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001765 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1767#endif
1768 break;
1769 case PyUnicode_4BYTE_KIND:
1770#if SIZEOF_WCHAR_T == 2
1771 /* This is the only case which has to process surrogates, thus
1772 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001773 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774#else
1775 assert(num_surrogates == 0);
1776 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1777#endif
1778 break;
1779 default:
1780 assert(0 && "Impossible state");
1781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001783 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784}
1785
Alexander Belopolsky40018472011-02-26 01:02:56 +00001786PyObject *
1787PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001788{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 if (size < 0) {
1790 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 return NULL;
1793 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001794 if (u != NULL)
1795 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1796 else
1797 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001798}
1799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001802{
1803 size_t size = strlen(u);
1804 if (size > PY_SSIZE_T_MAX) {
1805 PyErr_SetString(PyExc_OverflowError, "input too long");
1806 return NULL;
1807 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001808 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001809}
1810
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001811PyObject *
1812_PyUnicode_FromId(_Py_Identifier *id)
1813{
1814 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001815 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1816 strlen(id->string),
1817 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001818 if (!id->object)
1819 return NULL;
1820 PyUnicode_InternInPlace(&id->object);
1821 assert(!id->next);
1822 id->next = static_strings;
1823 static_strings = id;
1824 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001825 return id->object;
1826}
1827
1828void
1829_PyUnicode_ClearStaticStrings()
1830{
1831 _Py_Identifier *i;
1832 for (i = static_strings; i; i = i->next) {
1833 Py_DECREF(i->object);
1834 i->object = NULL;
1835 i->next = NULL;
1836 }
1837}
1838
Benjamin Peterson0df54292012-03-26 14:50:32 -04001839/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001840
Victor Stinnere57b1c02011-09-28 22:20:48 +02001841static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001842unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001843{
Victor Stinner785938e2011-12-11 20:09:03 +01001844 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001846#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001847 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001848#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001849 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001850 }
Victor Stinner785938e2011-12-11 20:09:03 +01001851 unicode = PyUnicode_New(size, 127);
1852 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001853 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001854 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1855 assert(_PyUnicode_CheckConsistency(unicode, 1));
1856 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001857}
1858
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001859static Py_UCS4
1860kind_maxchar_limit(unsigned int kind)
1861{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001862 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001863 case PyUnicode_1BYTE_KIND:
1864 return 0x80;
1865 case PyUnicode_2BYTE_KIND:
1866 return 0x100;
1867 case PyUnicode_4BYTE_KIND:
1868 return 0x10000;
1869 default:
1870 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001871 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001872 }
1873}
1874
Victor Stinnere6abb482012-05-02 01:15:40 +02001875Py_LOCAL_INLINE(Py_UCS4)
1876align_maxchar(Py_UCS4 maxchar)
1877{
1878 if (maxchar <= 127)
1879 return 127;
1880 else if (maxchar <= 255)
1881 return 255;
1882 else if (maxchar <= 65535)
1883 return 65535;
1884 else
1885 return MAX_UNICODE;
1886}
1887
Victor Stinner702c7342011-10-05 13:50:52 +02001888static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001889_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001892 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001893
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001894 if (size == 0) {
1895 Py_INCREF(unicode_empty);
1896 return unicode_empty;
1897 }
1898 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001899 if (size == 1)
1900 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001901
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001902 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 if (!res)
1905 return NULL;
1906 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001907 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001909}
1910
Victor Stinnere57b1c02011-09-28 22:20:48 +02001911static PyObject*
1912_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913{
1914 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001916
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 if (size == 0) {
1918 Py_INCREF(unicode_empty);
1919 return unicode_empty;
1920 }
1921 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001922 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001923 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001925 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001926 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 if (!res)
1928 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001929 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001931 else {
1932 _PyUnicode_CONVERT_BYTES(
1933 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1934 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001935 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 return res;
1937}
1938
Victor Stinnere57b1c02011-09-28 22:20:48 +02001939static PyObject*
1940_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001941{
1942 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001943 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001944
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001945 if (size == 0) {
1946 Py_INCREF(unicode_empty);
1947 return unicode_empty;
1948 }
1949 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001950 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001951 return get_latin1_char((unsigned char)u[0]);
1952
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001953 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 if (!res)
1956 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001957 if (max_char < 256)
1958 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1959 PyUnicode_1BYTE_DATA(res));
1960 else if (max_char < 0x10000)
1961 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1962 PyUnicode_2BYTE_DATA(res));
1963 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001965 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 return res;
1967}
1968
1969PyObject*
1970PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1971{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001972 if (size < 0) {
1973 PyErr_SetString(PyExc_ValueError, "size must be positive");
1974 return NULL;
1975 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001976 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001978 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001980 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001982 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001983 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001984 PyErr_SetString(PyExc_SystemError, "invalid kind");
1985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987}
1988
Victor Stinnerece58de2012-04-23 23:36:38 +02001989Py_UCS4
1990_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
1991{
1992 enum PyUnicode_Kind kind;
1993 void *startptr, *endptr;
1994
1995 assert(PyUnicode_IS_READY(unicode));
1996 assert(0 <= start);
1997 assert(end <= PyUnicode_GET_LENGTH(unicode));
1998 assert(start <= end);
1999
2000 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2001 return PyUnicode_MAX_CHAR_VALUE(unicode);
2002
2003 if (start == end)
2004 return 127;
2005
Victor Stinner94d558b2012-04-27 22:26:58 +02002006 if (PyUnicode_IS_ASCII(unicode))
2007 return 127;
2008
Victor Stinnerece58de2012-04-23 23:36:38 +02002009 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002010 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002011 endptr = (char *)startptr + end * kind;
2012 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002013 switch(kind) {
2014 case PyUnicode_1BYTE_KIND:
2015 return ucs1lib_find_max_char(startptr, endptr);
2016 case PyUnicode_2BYTE_KIND:
2017 return ucs2lib_find_max_char(startptr, endptr);
2018 case PyUnicode_4BYTE_KIND:
2019 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002020 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002021 assert(0);
2022 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002023 }
2024}
2025
Victor Stinner25a4b292011-10-06 12:31:55 +02002026/* Ensure that a string uses the most efficient storage, if it is not the
2027 case: create a new string with of the right kind. Write NULL into *p_unicode
2028 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002029static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002030unicode_adjust_maxchar(PyObject **p_unicode)
2031{
2032 PyObject *unicode, *copy;
2033 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002034 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002035 unsigned int kind;
2036
2037 assert(p_unicode != NULL);
2038 unicode = *p_unicode;
2039 assert(PyUnicode_IS_READY(unicode));
2040 if (PyUnicode_IS_ASCII(unicode))
2041 return;
2042
2043 len = PyUnicode_GET_LENGTH(unicode);
2044 kind = PyUnicode_KIND(unicode);
2045 if (kind == PyUnicode_1BYTE_KIND) {
2046 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002047 max_char = ucs1lib_find_max_char(u, u + len);
2048 if (max_char >= 128)
2049 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002050 }
2051 else if (kind == PyUnicode_2BYTE_KIND) {
2052 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002053 max_char = ucs2lib_find_max_char(u, u + len);
2054 if (max_char >= 256)
2055 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002056 }
2057 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002058 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002059 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002060 max_char = ucs4lib_find_max_char(u, u + len);
2061 if (max_char >= 0x10000)
2062 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002063 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002064 copy = PyUnicode_New(len, max_char);
2065 copy_characters(copy, 0, unicode, 0, len);
2066 Py_DECREF(unicode);
2067 *p_unicode = copy;
2068}
2069
Victor Stinner034f6cf2011-09-30 02:26:44 +02002070PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002071_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002072{
Victor Stinner87af4f22011-11-21 23:03:47 +01002073 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002074 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002075
Victor Stinner034f6cf2011-09-30 02:26:44 +02002076 if (!PyUnicode_Check(unicode)) {
2077 PyErr_BadInternalCall();
2078 return NULL;
2079 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002080 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002081 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002082
Victor Stinner87af4f22011-11-21 23:03:47 +01002083 length = PyUnicode_GET_LENGTH(unicode);
2084 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002085 if (!copy)
2086 return NULL;
2087 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2088
Victor Stinner87af4f22011-11-21 23:03:47 +01002089 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2090 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002091 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002092 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002093}
2094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095
Victor Stinnerbc603d12011-10-02 01:00:40 +02002096/* Widen Unicode objects to larger buffers. Don't write terminating null
2097 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098
2099void*
2100_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2101{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002102 Py_ssize_t len;
2103 void *result;
2104 unsigned int skind;
2105
Benjamin Petersonbac79492012-01-14 13:34:47 -05002106 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002107 return NULL;
2108
2109 len = PyUnicode_GET_LENGTH(s);
2110 skind = PyUnicode_KIND(s);
2111 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002112 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return NULL;
2114 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002115 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002116 case PyUnicode_2BYTE_KIND:
2117 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2118 if (!result)
2119 return PyErr_NoMemory();
2120 assert(skind == PyUnicode_1BYTE_KIND);
2121 _PyUnicode_CONVERT_BYTES(
2122 Py_UCS1, Py_UCS2,
2123 PyUnicode_1BYTE_DATA(s),
2124 PyUnicode_1BYTE_DATA(s) + len,
2125 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002127 case PyUnicode_4BYTE_KIND:
2128 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2129 if (!result)
2130 return PyErr_NoMemory();
2131 if (skind == PyUnicode_2BYTE_KIND) {
2132 _PyUnicode_CONVERT_BYTES(
2133 Py_UCS2, Py_UCS4,
2134 PyUnicode_2BYTE_DATA(s),
2135 PyUnicode_2BYTE_DATA(s) + len,
2136 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 else {
2139 assert(skind == PyUnicode_1BYTE_KIND);
2140 _PyUnicode_CONVERT_BYTES(
2141 Py_UCS1, Py_UCS4,
2142 PyUnicode_1BYTE_DATA(s),
2143 PyUnicode_1BYTE_DATA(s) + len,
2144 result);
2145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 default:
2148 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 }
Victor Stinner01698042011-10-04 00:04:26 +02002150 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 return NULL;
2152}
2153
2154static Py_UCS4*
2155as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2156 int copy_null)
2157{
2158 int kind;
2159 void *data;
2160 Py_ssize_t len, targetlen;
2161 if (PyUnicode_READY(string) == -1)
2162 return NULL;
2163 kind = PyUnicode_KIND(string);
2164 data = PyUnicode_DATA(string);
2165 len = PyUnicode_GET_LENGTH(string);
2166 targetlen = len;
2167 if (copy_null)
2168 targetlen++;
2169 if (!target) {
2170 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2171 PyErr_NoMemory();
2172 return NULL;
2173 }
2174 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2175 if (!target) {
2176 PyErr_NoMemory();
2177 return NULL;
2178 }
2179 }
2180 else {
2181 if (targetsize < targetlen) {
2182 PyErr_Format(PyExc_SystemError,
2183 "string is longer than the buffer");
2184 if (copy_null && 0 < targetsize)
2185 target[0] = 0;
2186 return NULL;
2187 }
2188 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002189 if (kind == PyUnicode_1BYTE_KIND) {
2190 Py_UCS1 *start = (Py_UCS1 *) data;
2191 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002193 else if (kind == PyUnicode_2BYTE_KIND) {
2194 Py_UCS2 *start = (Py_UCS2 *) data;
2195 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2196 }
2197 else {
2198 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 if (copy_null)
2202 target[len] = 0;
2203 return target;
2204}
2205
2206Py_UCS4*
2207PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2208 int copy_null)
2209{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002210 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 PyErr_BadInternalCall();
2212 return NULL;
2213 }
2214 return as_ucs4(string, target, targetsize, copy_null);
2215}
2216
2217Py_UCS4*
2218PyUnicode_AsUCS4Copy(PyObject *string)
2219{
2220 return as_ucs4(string, NULL, 0, 1);
2221}
2222
2223#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002224
Alexander Belopolsky40018472011-02-26 01:02:56 +00002225PyObject *
2226PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002229 if (size == 0) {
2230 Py_INCREF(unicode_empty);
2231 return unicode_empty;
2232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002233 PyErr_BadInternalCall();
2234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002235 }
2236
Martin v. Löwis790465f2008-04-05 20:41:37 +00002237 if (size == -1) {
2238 size = wcslen(w);
2239 }
2240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242}
2243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002245
Walter Dörwald346737f2007-05-31 10:44:43 +00002246static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002247makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2248 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002249{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 *fmt++ = '%';
2251 if (width) {
2252 if (zeropad)
2253 *fmt++ = '0';
2254 fmt += sprintf(fmt, "%d", width);
2255 }
2256 if (precision)
2257 fmt += sprintf(fmt, ".%d", precision);
2258 if (longflag)
2259 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002260 else if (longlongflag) {
2261 /* longlongflag should only ever be nonzero on machines with
2262 HAVE_LONG_LONG defined */
2263#ifdef HAVE_LONG_LONG
2264 char *f = PY_FORMAT_LONG_LONG;
2265 while (*f)
2266 *fmt++ = *f++;
2267#else
2268 /* we shouldn't ever get here */
2269 assert(0);
2270 *fmt++ = 'l';
2271#endif
2272 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 else if (size_tflag) {
2274 char *f = PY_FORMAT_SIZE_T;
2275 while (*f)
2276 *fmt++ = *f++;
2277 }
2278 *fmt++ = c;
2279 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002280}
2281
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%' pointed to by `f`.

   The optional "width" and ".precision" digits are read first, then an
   optional length modifier: "l", "ll" (only with HAVE_LONG_LONG) or "z".
   A modifier is only recognized when it is directly followed by 'd', 'u'
   or 'i'.  Each output pointer may be NULL, in which case the
   corresponding value is not stored.

   Return a pointer to the conversion character.  For a format that ends
   right after the digits (e.g. "%.1"), or for a precision directly followed
   by '%', the returned pointer is moved one character backward. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; *f >= '0' && *f <= '9'; f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; *f >= '0' && *f <= '9'; f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld, %llu and the size_t variants %zd, %zu. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2346
/* Number of characters reserved for the output of %ld: 21 characters are
   enough for a 64-bit integer written in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Number of characters reserved for the output of %lld: at most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2354
Walter Dörwaldd2034312007-05-18 16:29:38 +00002355PyObject *
2356PyUnicode_FromFormatV(const char *format, va_list vargs)
2357{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002358 va_list count;
2359 Py_ssize_t callcount = 0;
2360 PyObject **callresults = NULL;
2361 PyObject **callresult = NULL;
2362 Py_ssize_t n = 0;
2363 int width = 0;
2364 int precision = 0;
2365 int zeropad;
2366 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002367 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002368 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002369 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002370 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2371 Py_UCS4 argmaxchar;
2372 Py_ssize_t numbersize = 0;
2373 char *numberresults = NULL;
2374 char *numberresult = NULL;
2375 Py_ssize_t i;
2376 int kind;
2377 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002378
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002379 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002380 /* step 1: count the number of %S/%R/%A/%s format specifications
2381 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2382 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002384 * also estimate a upper bound for all the number formats in the string,
2385 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 for (f = format; *f; f++) {
2388 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002389 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2391 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2392 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2393 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002396#ifdef HAVE_LONG_LONG
2397 if (longlongflag) {
2398 if (width < MAX_LONG_LONG_CHARS)
2399 width = MAX_LONG_LONG_CHARS;
2400 }
2401 else
2402#endif
2403 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2404 including sign. Decimal takes the most space. This
2405 isn't enough for octal. If a width is specified we
2406 need more (which we allocate later). */
2407 if (width < MAX_LONG_CHARS)
2408 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409
2410 /* account for the size + '\0' to separate numbers
2411 inside of the numberresults buffer */
2412 numbersize += (width + 1);
2413 }
2414 }
2415 else if ((unsigned char)*f > 127) {
2416 PyErr_Format(PyExc_ValueError,
2417 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2418 "string, got a non-ASCII byte: 0x%02x",
2419 (unsigned char)*f);
2420 return NULL;
2421 }
2422 }
2423 /* step 2: allocate memory for the results of
2424 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2425 if (callcount) {
2426 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2427 if (!callresults) {
2428 PyErr_NoMemory();
2429 return NULL;
2430 }
2431 callresult = callresults;
2432 }
2433 /* step 2.5: allocate memory for the results of formating numbers */
2434 if (numbersize) {
2435 numberresults = PyObject_Malloc(numbersize);
2436 if (!numberresults) {
2437 PyErr_NoMemory();
2438 goto fail;
2439 }
2440 numberresult = numberresults;
2441 }
2442
2443 /* step 3: format numbers and figure out how large a buffer we need */
2444 for (f = format; *f; f++) {
2445 if (*f == '%') {
2446 const char* p;
2447 int longflag;
2448 int longlongflag;
2449 int size_tflag;
2450 int numprinted;
2451
2452 p = f;
2453 zeropad = (f[1] == '0');
2454 f = parse_format_flags(f, &width, &precision,
2455 &longflag, &longlongflag, &size_tflag);
2456 switch (*f) {
2457 case 'c':
2458 {
2459 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002460 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 n++;
2462 break;
2463 }
2464 case '%':
2465 n++;
2466 break;
2467 case 'i':
2468 case 'd':
2469 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2470 width, precision, *f);
2471 if (longflag)
2472 numprinted = sprintf(numberresult, fmt,
2473 va_arg(count, long));
2474#ifdef HAVE_LONG_LONG
2475 else if (longlongflag)
2476 numprinted = sprintf(numberresult, fmt,
2477 va_arg(count, PY_LONG_LONG));
2478#endif
2479 else if (size_tflag)
2480 numprinted = sprintf(numberresult, fmt,
2481 va_arg(count, Py_ssize_t));
2482 else
2483 numprinted = sprintf(numberresult, fmt,
2484 va_arg(count, int));
2485 n += numprinted;
2486 /* advance by +1 to skip over the '\0' */
2487 numberresult += (numprinted + 1);
2488 assert(*(numberresult - 1) == '\0');
2489 assert(*(numberresult - 2) != '\0');
2490 assert(numprinted >= 0);
2491 assert(numberresult <= numberresults + numbersize);
2492 break;
2493 case 'u':
2494 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2495 width, precision, 'u');
2496 if (longflag)
2497 numprinted = sprintf(numberresult, fmt,
2498 va_arg(count, unsigned long));
2499#ifdef HAVE_LONG_LONG
2500 else if (longlongflag)
2501 numprinted = sprintf(numberresult, fmt,
2502 va_arg(count, unsigned PY_LONG_LONG));
2503#endif
2504 else if (size_tflag)
2505 numprinted = sprintf(numberresult, fmt,
2506 va_arg(count, size_t));
2507 else
2508 numprinted = sprintf(numberresult, fmt,
2509 va_arg(count, unsigned int));
2510 n += numprinted;
2511 numberresult += (numprinted + 1);
2512 assert(*(numberresult - 1) == '\0');
2513 assert(*(numberresult - 2) != '\0');
2514 assert(numprinted >= 0);
2515 assert(numberresult <= numberresults + numbersize);
2516 break;
2517 case 'x':
2518 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2519 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2520 n += numprinted;
2521 numberresult += (numprinted + 1);
2522 assert(*(numberresult - 1) == '\0');
2523 assert(*(numberresult - 2) != '\0');
2524 assert(numprinted >= 0);
2525 assert(numberresult <= numberresults + numbersize);
2526 break;
2527 case 'p':
2528 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2529 /* %p is ill-defined: ensure leading 0x. */
2530 if (numberresult[1] == 'X')
2531 numberresult[1] = 'x';
2532 else if (numberresult[1] != 'x') {
2533 memmove(numberresult + 2, numberresult,
2534 strlen(numberresult) + 1);
2535 numberresult[0] = '0';
2536 numberresult[1] = 'x';
2537 numprinted += 2;
2538 }
2539 n += numprinted;
2540 numberresult += (numprinted + 1);
2541 assert(*(numberresult - 1) == '\0');
2542 assert(*(numberresult - 2) != '\0');
2543 assert(numprinted >= 0);
2544 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002545 break;
2546 case 's':
2547 {
2548 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002549 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002550 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002551 if (!str)
2552 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 /* since PyUnicode_DecodeUTF8 returns already flexible
2554 unicode objects, there is no need to call ready on them */
2555 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002556 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002558 /* Remember the str and switch to the next slot */
2559 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 break;
2561 }
2562 case 'U':
2563 {
2564 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002565 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 if (PyUnicode_READY(obj) == -1)
2567 goto fail;
2568 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002569 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002571 break;
2572 }
2573 case 'V':
2574 {
2575 PyObject *obj = va_arg(count, PyObject *);
2576 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002577 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002578 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002579 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002580 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 if (PyUnicode_READY(obj) == -1)
2582 goto fail;
2583 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002584 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002586 *callresult++ = NULL;
2587 }
2588 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002589 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002590 if (!str_obj)
2591 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002592 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002593 Py_DECREF(str_obj);
2594 goto fail;
2595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002597 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002599 *callresult++ = str_obj;
2600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 break;
2602 }
2603 case 'S':
2604 {
2605 PyObject *obj = va_arg(count, PyObject *);
2606 PyObject *str;
2607 assert(obj);
2608 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002609 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002611 if (PyUnicode_READY(str) == -1) {
2612 Py_DECREF(str);
2613 goto fail;
2614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002616 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002618 /* Remember the str and switch to the next slot */
2619 *callresult++ = str;
2620 break;
2621 }
2622 case 'R':
2623 {
2624 PyObject *obj = va_arg(count, PyObject *);
2625 PyObject *repr;
2626 assert(obj);
2627 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002628 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002630 if (PyUnicode_READY(repr) == -1) {
2631 Py_DECREF(repr);
2632 goto fail;
2633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002635 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 /* Remember the repr and switch to the next slot */
2638 *callresult++ = repr;
2639 break;
2640 }
2641 case 'A':
2642 {
2643 PyObject *obj = va_arg(count, PyObject *);
2644 PyObject *ascii;
2645 assert(obj);
2646 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002647 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002649 if (PyUnicode_READY(ascii) == -1) {
2650 Py_DECREF(ascii);
2651 goto fail;
2652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002654 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* Remember the repr and switch to the next slot */
2657 *callresult++ = ascii;
2658 break;
2659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 default:
2661 /* if we stumble upon an unknown
2662 formatting code, copy the rest of
2663 the format string to the output
2664 string. (we cannot just skip the
2665 code, since there's no way to know
2666 what's in the argument list) */
2667 n += strlen(p);
2668 goto expand;
2669 }
2670 } else
2671 n++;
2672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002673 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 we don't have to resize the string.
2677 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002678 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 if (!string)
2680 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 kind = PyUnicode_KIND(string);
2682 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002688 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002689
2690 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2692 /* checking for == because the last argument could be a empty
2693 string, which causes i to point to end, the assert at the end of
2694 the loop */
2695 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 switch (*f) {
2698 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002699 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 const int ordinal = va_arg(vargs, int);
2701 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002703 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002704 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002709 {
2710 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 /* unused, since we already have the result */
2712 if (*f == 'p')
2713 (void) va_arg(vargs, void *);
2714 else
2715 (void) va_arg(vargs, int);
2716 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002717 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002719 i += written;
2720 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 assert(*numberresult == '\0');
2722 numberresult++;
2723 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 case 's':
2727 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002728 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002730 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 size = PyUnicode_GET_LENGTH(*callresult);
2732 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002733 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002735 /* We're done with the unicode()/repr() => forget it */
2736 Py_DECREF(*callresult);
2737 /* switch to next unicode()/repr() result */
2738 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 break;
2740 }
2741 case 'U':
2742 {
2743 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 Py_ssize_t size;
2745 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2746 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002747 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 break;
2750 }
2751 case 'V':
2752 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002754 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002755 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 size = PyUnicode_GET_LENGTH(obj);
2758 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002759 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002762 size = PyUnicode_GET_LENGTH(*callresult);
2763 assert(PyUnicode_KIND(*callresult) <=
2764 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002765 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002767 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002769 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 break;
2771 }
2772 case 'S':
2773 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002774 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002775 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002776 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 /* unused, since we already have the result */
2778 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002780 copy_characters(string, i, *callresult, 0, size);
2781 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 /* We're done with the unicode()/repr() => forget it */
2783 Py_DECREF(*callresult);
2784 /* switch to next unicode()/repr() result */
2785 ++callresult;
2786 break;
2787 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 break;
2791 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002792 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 goto end;
2795 }
Victor Stinner1205f272010-09-11 00:54:47 +00002796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 else {
2798 assert(i < PyUnicode_GET_LENGTH(string));
2799 PyUnicode_WRITE(kind, data, i++, *f);
2800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002803
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002805 if (callresults)
2806 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 if (numberresults)
2808 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002809 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 if (callresults) {
2812 PyObject **callresult2 = callresults;
2813 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002814 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 ++callresult2;
2816 }
2817 PyObject_Free(callresults);
2818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 if (numberresults)
2820 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002821 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002822}
2823
Walter Dörwaldd2034312007-05-18 16:29:38 +00002824PyObject *
2825PyUnicode_FromFormat(const char *format, ...)
2826{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 PyObject* ret;
2828 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002829
2830#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002831 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002832#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002833 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002834#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002835 ret = PyUnicode_FromFormatV(format, vargs);
2836 va_end(vargs);
2837 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002838}
2839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840#ifdef HAVE_WCHAR_H
2841
Victor Stinner5593d8a2010-10-02 11:11:27 +00002842/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2843 convert a Unicode object to a wide character string.
2844
Victor Stinnerd88d9832011-09-06 02:00:05 +02002845 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002846 character) required to convert the unicode object. Ignore size argument.
2847
Victor Stinnerd88d9832011-09-06 02:00:05 +02002848 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002849 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002850 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002851static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002853 wchar_t *w,
2854 Py_ssize_t size)
2855{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002856 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002857 const wchar_t *wstr;
2858
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002859 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002860 if (wstr == NULL)
2861 return -1;
2862
Victor Stinner5593d8a2010-10-02 11:11:27 +00002863 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002864 if (size > res)
2865 size = res + 1;
2866 else
2867 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869 return res;
2870 }
2871 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002872 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002873}
2874
2875Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002876PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002877 wchar_t *w,
2878 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879{
2880 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_BadInternalCall();
2882 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002884 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885}
2886
Victor Stinner137c34c2010-09-29 10:25:54 +00002887wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002888PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002889 Py_ssize_t *size)
2890{
2891 wchar_t* buffer;
2892 Py_ssize_t buflen;
2893
2894 if (unicode == NULL) {
2895 PyErr_BadInternalCall();
2896 return NULL;
2897 }
2898
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002899 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 if (buflen == -1)
2901 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002902 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002903 PyErr_NoMemory();
2904 return NULL;
2905 }
2906
Victor Stinner137c34c2010-09-29 10:25:54 +00002907 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2908 if (buffer == NULL) {
2909 PyErr_NoMemory();
2910 return NULL;
2911 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002912 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002913 if (buflen == -1)
2914 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002915 if (size != NULL)
2916 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002917 return buffer;
2918}
2919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002925 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002926 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002927 PyErr_SetString(PyExc_ValueError,
2928 "chr() arg not in range(0x110000)");
2929 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002930 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002932 if (ordinal < 256)
2933 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002935 v = PyUnicode_New(1, ordinal);
2936 if (v == NULL)
2937 return NULL;
2938 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002939 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002941}
2942
Alexander Belopolsky40018472011-02-26 01:02:56 +00002943PyObject *
2944PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002946 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002948 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002949 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002950 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 Py_INCREF(obj);
2952 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002953 }
2954 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002955 /* For a Unicode subtype that's not a Unicode object,
2956 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002957 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002958 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002959 PyErr_Format(PyExc_TypeError,
2960 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002961 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002962 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002963}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002967 const char *encoding,
2968 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002969{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002970 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002971 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002972
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 PyErr_BadInternalCall();
2975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002977
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002978 /* Decoding bytes objects is the most common case and should be fast */
2979 if (PyBytes_Check(obj)) {
2980 if (PyBytes_GET_SIZE(obj) == 0) {
2981 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002982 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002983 }
2984 else {
2985 v = PyUnicode_Decode(
2986 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2987 encoding, errors);
2988 }
2989 return v;
2990 }
2991
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002992 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 PyErr_SetString(PyExc_TypeError,
2994 "decoding str is not supported");
2995 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002996 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002997
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002998 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2999 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3000 PyErr_Format(PyExc_TypeError,
3001 "coercing to str: need bytes, bytearray "
3002 "or buffer-like object, %.80s found",
3003 Py_TYPE(obj)->tp_name);
3004 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003005 }
Tim Petersced69f82003-09-16 20:30:58 +00003006
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003007 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003008 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003009 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010 }
Tim Petersced69f82003-09-16 20:30:58 +00003011 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003012 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003013
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003014 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003015 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016}
3017
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, writing the null-terminated result into lower.

   A NULL encoding is normalized to "utf-8".

   lower_len is the capacity of lower in bytes, terminator included.
   Return 0 on error (the normalized name, including a NULL encoding's
   "utf-8", does not fit in lower_len-1 characters), 1 on success. On
   error the contents of lower are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator, and would make
       the lower_len - 1 computation below wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminator, so this is the exact room needed. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3054
Alexander Belopolsky40018472011-02-26 01:02:56 +00003055PyObject *
3056PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003057 Py_ssize_t size,
3058 const char *encoding,
3059 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003060{
3061 PyObject *buffer = NULL, *unicode;
3062 Py_buffer info;
3063 char lower[11]; /* Enough for any encoding shortcut */
3064
Fred Drakee4315f52000-05-09 19:53:39 +00003065 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003066 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003067 if ((strcmp(lower, "utf-8") == 0) ||
3068 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003069 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003070 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003071 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003072 (strcmp(lower, "iso-8859-1") == 0))
3073 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003074#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003075 else if (strcmp(lower, "mbcs") == 0)
3076 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003077#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003078 else if (strcmp(lower, "ascii") == 0)
3079 return PyUnicode_DecodeASCII(s, size, errors);
3080 else if (strcmp(lower, "utf-16") == 0)
3081 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3082 else if (strcmp(lower, "utf-32") == 0)
3083 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
3086 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003087 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003088 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003089 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003090 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 if (buffer == NULL)
3092 goto onError;
3093 unicode = PyCodec_Decode(buffer, encoding, errors);
3094 if (unicode == NULL)
3095 goto onError;
3096 if (!PyUnicode_Check(unicode)) {
3097 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003098 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003099 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 Py_DECREF(unicode);
3101 goto onError;
3102 }
3103 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003104 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003105
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 Py_XDECREF(buffer);
3108 return NULL;
3109}
3110
Alexander Belopolsky40018472011-02-26 01:02:56 +00003111PyObject *
3112PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003113 const char *encoding,
3114 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003115{
3116 PyObject *v;
3117
3118 if (!PyUnicode_Check(unicode)) {
3119 PyErr_BadArgument();
3120 goto onError;
3121 }
3122
3123 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003124 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003125
3126 /* Decode via the codec registry */
3127 v = PyCodec_Decode(unicode, encoding, errors);
3128 if (v == NULL)
3129 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003130 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003131
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003133 return NULL;
3134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003140{
3141 PyObject *v;
3142
3143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
3145 goto onError;
3146 }
3147
3148 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003150
3151 /* Decode via the codec registry */
3152 v = PyCodec_Decode(unicode, encoding, errors);
3153 if (v == NULL)
3154 goto onError;
3155 if (!PyUnicode_Check(v)) {
3156 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003157 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003158 Py_TYPE(v)->tp_name);
3159 Py_DECREF(v);
3160 goto onError;
3161 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003162 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003163
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003165 return NULL;
3166}
3167
Alexander Belopolsky40018472011-02-26 01:02:56 +00003168PyObject *
3169PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003170 Py_ssize_t size,
3171 const char *encoding,
3172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173{
3174 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003175
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 unicode = PyUnicode_FromUnicode(s, size);
3177 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3180 Py_DECREF(unicode);
3181 return v;
3182}
3183
Alexander Belopolsky40018472011-02-26 01:02:56 +00003184PyObject *
3185PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003186 const char *encoding,
3187 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003188{
3189 PyObject *v;
3190
3191 if (!PyUnicode_Check(unicode)) {
3192 PyErr_BadArgument();
3193 goto onError;
3194 }
3195
3196 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003198
3199 /* Encode via the codec registry */
3200 v = PyCodec_Encode(unicode, encoding, errors);
3201 if (v == NULL)
3202 goto onError;
3203 return v;
3204
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003206 return NULL;
3207}
3208
/* Locate the first wide character of the null-terminated string wstr that
   wcstombs() refuses to convert, by converting the string one character at
   a time (one surrogate pair at a time when wchar_t is 16 bits wide).

   Returns the index of the offending character, or 0 if every character
   converts -- so 0 is ambiguous between "first character" and "not
   found". */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* unit holds the single character (or surrogate pair) under test; it
       starts zeroed, so its last slot always terminates the string. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    size_t pos = 0;

    while (wstr[pos] != L'\0') {
        size_t begin = pos;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[pos])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[pos + 1]))
        {
            /* Keep a surrogate pair together. */
            unit[0] = wstr[pos];
            unit[1] = wstr[pos + 1];
            pos += 2;
        }
        else {
            unit[0] = wstr[pos];
            unit[1] = 0;
            pos += 1;
        }
#else
        unit[0] = wstr[pos];
        pos += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return begin;
    }

    /* failed to find the unencodable character */
    return 0;
}
3255
Victor Stinner1b579672011-12-17 05:47:23 +01003256static int
3257locale_error_handler(const char *errors, int *surrogateescape)
3258{
3259 if (errors == NULL) {
3260 *surrogateescape = 0;
3261 return 0;
3262 }
3263
3264 if (strcmp(errors, "strict") == 0) {
3265 *surrogateescape = 0;
3266 return 0;
3267 }
3268 if (strcmp(errors, "surrogateescape") == 0) {
3269 *surrogateescape = 1;
3270 return 0;
3271 }
3272 PyErr_Format(PyExc_ValueError,
3273 "only 'strict' and 'surrogateescape' error handlers "
3274 "are supported, not '%s'",
3275 errors);
3276 return -1;
3277}
3278
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003280PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003281{
3282 Py_ssize_t wlen, wlen2;
3283 wchar_t *wstr;
3284 PyObject *bytes = NULL;
3285 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003286 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003287 PyObject *exc;
3288 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003289 int surrogateescape;
3290
3291 if (locale_error_handler(errors, &surrogateescape) < 0)
3292 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003293
3294 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3295 if (wstr == NULL)
3296 return NULL;
3297
3298 wlen2 = wcslen(wstr);
3299 if (wlen2 != wlen) {
3300 PyMem_Free(wstr);
3301 PyErr_SetString(PyExc_TypeError, "embedded null character");
3302 return NULL;
3303 }
3304
3305 if (surrogateescape) {
3306 /* locale encoding with surrogateescape */
3307 char *str;
3308
3309 str = _Py_wchar2char(wstr, &error_pos);
3310 if (str == NULL) {
3311 if (error_pos == (size_t)-1) {
3312 PyErr_NoMemory();
3313 PyMem_Free(wstr);
3314 return NULL;
3315 }
3316 else {
3317 goto encode_error;
3318 }
3319 }
3320 PyMem_Free(wstr);
3321
3322 bytes = PyBytes_FromString(str);
3323 PyMem_Free(str);
3324 }
3325 else {
3326 size_t len, len2;
3327
3328 len = wcstombs(NULL, wstr, 0);
3329 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003330 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003331 goto encode_error;
3332 }
3333
3334 bytes = PyBytes_FromStringAndSize(NULL, len);
3335 if (bytes == NULL) {
3336 PyMem_Free(wstr);
3337 return NULL;
3338 }
3339
3340 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3341 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003342 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003343 goto encode_error;
3344 }
3345 PyMem_Free(wstr);
3346 }
3347 return bytes;
3348
3349encode_error:
3350 errmsg = strerror(errno);
3351 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003352
3353 if (error_pos == (size_t)-1)
3354 error_pos = wcstombs_errorpos(wstr);
3355
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003356 PyMem_Free(wstr);
3357 Py_XDECREF(bytes);
3358
Victor Stinner2f197072011-12-17 07:08:30 +01003359 if (errmsg != NULL) {
3360 size_t errlen;
3361 wstr = _Py_char2wchar(errmsg, &errlen);
3362 if (wstr != NULL) {
3363 reason = PyUnicode_FromWideChar(wstr, errlen);
3364 PyMem_Free(wstr);
3365 } else
3366 errmsg = NULL;
3367 }
3368 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003369 reason = PyUnicode_FromString(
3370 "wcstombs() encountered an unencodable "
3371 "wide character");
3372 if (reason == NULL)
3373 return NULL;
3374
3375 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3376 "locale", unicode,
3377 (Py_ssize_t)error_pos,
3378 (Py_ssize_t)(error_pos+1),
3379 reason);
3380 Py_DECREF(reason);
3381 if (exc != NULL) {
3382 PyCodec_StrictErrors(exc);
3383 Py_XDECREF(exc);
3384 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003385 return NULL;
3386}
3387
Victor Stinnerad158722010-10-27 00:25:46 +00003388PyObject *
3389PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003390{
Victor Stinner99b95382011-07-04 14:23:54 +02003391#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003392 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003393#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003394 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003395#else
Victor Stinner793b5312011-04-27 00:24:21 +02003396 PyInterpreterState *interp = PyThreadState_GET()->interp;
3397 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3398 cannot use it to encode and decode filenames before it is loaded. Load
3399 the Python codec requires to encode at least its own filename. Use the C
3400 version of the locale codec until the codec registry is initialized and
3401 the Python codec is loaded.
3402
3403 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3404 cannot only rely on it: check also interp->fscodec_initialized for
3405 subinterpreters. */
3406 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003407 return PyUnicode_AsEncodedString(unicode,
3408 Py_FileSystemDefaultEncoding,
3409 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003410 }
3411 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003412 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003413 }
Victor Stinnerad158722010-10-27 00:25:46 +00003414#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003415}
3416
Alexander Belopolsky40018472011-02-26 01:02:56 +00003417PyObject *
3418PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003419 const char *encoding,
3420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421{
3422 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003423 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003424
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 if (!PyUnicode_Check(unicode)) {
3426 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 }
Fred Drakee4315f52000-05-09 19:53:39 +00003429
Fred Drakee4315f52000-05-09 19:53:39 +00003430 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003431 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003432 if ((strcmp(lower, "utf-8") == 0) ||
3433 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003434 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003435 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003436 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003437 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003438 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003439 }
Victor Stinner37296e82010-06-10 13:36:23 +00003440 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003441 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003442 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003443 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003444#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003445 else if (strcmp(lower, "mbcs") == 0)
3446 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003447#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003448 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003449 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451
3452 /* Encode via the codec registry */
3453 v = PyCodec_Encode(unicode, encoding, errors);
3454 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003455 return NULL;
3456
3457 /* The normal path */
3458 if (PyBytes_Check(v))
3459 return v;
3460
3461 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003462 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003463 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003464 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003465
3466 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3467 "encoder %s returned bytearray instead of bytes",
3468 encoding);
3469 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003470 Py_DECREF(v);
3471 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003473
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003474 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3475 Py_DECREF(v);
3476 return b;
3477 }
3478
3479 PyErr_Format(PyExc_TypeError,
3480 "encoder did not return a bytes object (type=%.400s)",
3481 Py_TYPE(v)->tp_name);
3482 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003483 return NULL;
3484}
3485
Alexander Belopolsky40018472011-02-26 01:02:56 +00003486PyObject *
3487PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003488 const char *encoding,
3489 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003490{
3491 PyObject *v;
3492
3493 if (!PyUnicode_Check(unicode)) {
3494 PyErr_BadArgument();
3495 goto onError;
3496 }
3497
3498 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003499 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500
3501 /* Encode via the codec registry */
3502 v = PyCodec_Encode(unicode, encoding, errors);
3503 if (v == NULL)
3504 goto onError;
3505 if (!PyUnicode_Check(v)) {
3506 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003507 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003508 Py_TYPE(v)->tp_name);
3509 Py_DECREF(v);
3510 goto onError;
3511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003513
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 return NULL;
3516}
3517
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot decode, starting from a zeroed conversion state.

   Returns the byte offset of the invalid or incomplete sequence, or 0 if
   none is found (or if HAVE_MBRTOWC is not defined) -- so 0 is ambiguous
   between "first byte" and "not found". */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3548
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003549PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003550PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003551 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003552{
3553 wchar_t smallbuf[256];
3554 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3555 wchar_t *wstr;
3556 size_t wlen, wlen2;
3557 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003558 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003559 size_t error_pos;
3560 char *errmsg;
3561 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003562
3563 if (locale_error_handler(errors, &surrogateescape) < 0)
3564 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565
3566 if (str[len] != '\0' || len != strlen(str)) {
3567 PyErr_SetString(PyExc_TypeError, "embedded null character");
3568 return NULL;
3569 }
3570
3571 if (surrogateescape)
3572 {
3573 wstr = _Py_char2wchar(str, &wlen);
3574 if (wstr == NULL) {
3575 if (wlen == (size_t)-1)
3576 PyErr_NoMemory();
3577 else
3578 PyErr_SetFromErrno(PyExc_OSError);
3579 return NULL;
3580 }
3581
3582 unicode = PyUnicode_FromWideChar(wstr, wlen);
3583 PyMem_Free(wstr);
3584 }
3585 else {
3586#ifndef HAVE_BROKEN_MBSTOWCS
3587 wlen = mbstowcs(NULL, str, 0);
3588#else
3589 wlen = len;
3590#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003591 if (wlen == (size_t)-1)
3592 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003593 if (wlen+1 <= smallbuf_len) {
3594 wstr = smallbuf;
3595 }
3596 else {
3597 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3598 return PyErr_NoMemory();
3599
3600 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3601 if (!wstr)
3602 return PyErr_NoMemory();
3603 }
3604
3605 /* This shouldn't fail now */
3606 wlen2 = mbstowcs(wstr, str, wlen+1);
3607 if (wlen2 == (size_t)-1) {
3608 if (wstr != smallbuf)
3609 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003610 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003611 }
3612#ifdef HAVE_BROKEN_MBSTOWCS
3613 assert(wlen2 == wlen);
3614#endif
3615 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3616 if (wstr != smallbuf)
3617 PyMem_Free(wstr);
3618 }
3619 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003620
3621decode_error:
3622 errmsg = strerror(errno);
3623 assert(errmsg != NULL);
3624
3625 error_pos = mbstowcs_errorpos(str, len);
3626 if (errmsg != NULL) {
3627 size_t errlen;
3628 wstr = _Py_char2wchar(errmsg, &errlen);
3629 if (wstr != NULL) {
3630 reason = PyUnicode_FromWideChar(wstr, errlen);
3631 PyMem_Free(wstr);
3632 } else
3633 errmsg = NULL;
3634 }
3635 if (errmsg == NULL)
3636 reason = PyUnicode_FromString(
3637 "mbstowcs() encountered an invalid multibyte sequence");
3638 if (reason == NULL)
3639 return NULL;
3640
3641 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3642 "locale", str, len,
3643 (Py_ssize_t)error_pos,
3644 (Py_ssize_t)(error_pos+1),
3645 reason);
3646 Py_DECREF(reason);
3647 if (exc != NULL) {
3648 PyCodec_StrictErrors(exc);
3649 Py_XDECREF(exc);
3650 }
3651 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003652}
3653
3654PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003655PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003656{
3657 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003658 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003659}
3660
3661
3662PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003663PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003664 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003665 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3666}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003667
Christian Heimes5894ba72007-11-04 11:43:14 +00003668PyObject*
3669PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3670{
Victor Stinner99b95382011-07-04 14:23:54 +02003671#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003672 return PyUnicode_DecodeMBCS(s, size, NULL);
3673#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003674 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003675#else
Victor Stinner793b5312011-04-27 00:24:21 +02003676 PyInterpreterState *interp = PyThreadState_GET()->interp;
3677 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3678 cannot use it to encode and decode filenames before it is loaded. Load
3679 the Python codec requires to encode at least its own filename. Use the C
3680 version of the locale codec until the codec registry is initialized and
3681 the Python codec is loaded.
3682
3683 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3684 cannot only rely on it: check also interp->fscodec_initialized for
3685 subinterpreters. */
3686 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687 return PyUnicode_Decode(s, size,
3688 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003689 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003690 }
3691 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003692 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003693 }
Victor Stinnerad158722010-10-27 00:25:46 +00003694#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003695}
3696
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697
3698int
Antoine Pitrou13348842012-01-29 18:36:34 +01003699_PyUnicode_HasNULChars(PyObject* s)
3700{
3701 static PyObject *nul = NULL;
3702
3703 if (nul == NULL)
3704 nul = PyUnicode_FromStringAndSize("\0", 1);
3705 if (nul == NULL)
3706 return -1;
3707 return PyUnicode_Contains(s, nul);
3708}
3709
3710
3711int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003712PyUnicode_FSConverter(PyObject* arg, void* addr)
3713{
3714 PyObject *output = NULL;
3715 Py_ssize_t size;
3716 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003717 if (arg == NULL) {
3718 Py_DECREF(*(PyObject**)addr);
3719 return 1;
3720 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003721 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003722 output = arg;
3723 Py_INCREF(output);
3724 }
3725 else {
3726 arg = PyUnicode_FromObject(arg);
3727 if (!arg)
3728 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003729 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003730 Py_DECREF(arg);
3731 if (!output)
3732 return 0;
3733 if (!PyBytes_Check(output)) {
3734 Py_DECREF(output);
3735 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3736 return 0;
3737 }
3738 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003739 size = PyBytes_GET_SIZE(output);
3740 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003741 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003742 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003743 Py_DECREF(output);
3744 return 0;
3745 }
3746 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003747 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003748}
3749
3750
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003751int
3752PyUnicode_FSDecoder(PyObject* arg, void* addr)
3753{
3754 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003755 if (arg == NULL) {
3756 Py_DECREF(*(PyObject**)addr);
3757 return 1;
3758 }
3759 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003760 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003762 output = arg;
3763 Py_INCREF(output);
3764 }
3765 else {
3766 arg = PyBytes_FromObject(arg);
3767 if (!arg)
3768 return 0;
3769 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3770 PyBytes_GET_SIZE(arg));
3771 Py_DECREF(arg);
3772 if (!output)
3773 return 0;
3774 if (!PyUnicode_Check(output)) {
3775 Py_DECREF(output);
3776 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3777 return 0;
3778 }
3779 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003780 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003781 Py_DECREF(output);
3782 return 0;
3783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003785 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003786 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3787 Py_DECREF(output);
3788 return 0;
3789 }
3790 *(PyObject**)addr = output;
3791 return Py_CLEANUP_SUPPORTED;
3792}
3793
3794
Martin v. Löwis5b222132007-06-10 09:51:05 +00003795char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003797{
Christian Heimesf3863112007-11-22 07:46:41 +00003798 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003800 if (!PyUnicode_Check(unicode)) {
3801 PyErr_BadArgument();
3802 return NULL;
3803 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003805 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003807 if (PyUnicode_UTF8(unicode) == NULL) {
3808 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3810 if (bytes == NULL)
3811 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3813 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 Py_DECREF(bytes);
3815 return NULL;
3816 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3818 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3819 PyBytes_AS_STRING(bytes),
3820 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 Py_DECREF(bytes);
3822 }
3823
3824 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003825 *psize = PyUnicode_UTF8_LENGTH(unicode);
3826 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003827}
3828
3829char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3833}
3834
#if defined(Py_DEBUG)
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3838
3839
3840Py_UNICODE *
3841PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 const unsigned char *one_byte;
3844#if SIZEOF_WCHAR_T == 4
3845 const Py_UCS2 *two_bytes;
3846#else
3847 const Py_UCS4 *four_bytes;
3848 const Py_UCS4 *ucs4_end;
3849 Py_ssize_t num_surrogates;
3850#endif
3851 wchar_t *w;
3852 wchar_t *wchar_end;
3853
3854 if (!PyUnicode_Check(unicode)) {
3855 PyErr_BadArgument();
3856 return NULL;
3857 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003858 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 assert(_PyUnicode_KIND(unicode) != 0);
3861 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862
3863#ifdef Py_DEBUG
3864 ++unicode_as_unicode_calls;
3865#endif
3866
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003867 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003869 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3870 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 num_surrogates = 0;
3872
3873 for (; four_bytes < ucs4_end; ++four_bytes) {
3874 if (*four_bytes > 0xFFFF)
3875 ++num_surrogates;
3876 }
3877
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3879 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3880 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 PyErr_NoMemory();
3882 return NULL;
3883 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003884 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003886 w = _PyUnicode_WSTR(unicode);
3887 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3888 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3890 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003891 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003893 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3894 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 }
3896 else
3897 *w = *four_bytes;
3898
3899 if (w > wchar_end) {
3900 assert(0 && "Miscalculated string end");
3901 }
3902 }
3903 *w = 0;
3904#else
3905 /* sizeof(wchar_t) == 4 */
3906 Py_FatalError("Impossible unicode object state, wstr and str "
3907 "should share memory already.");
3908 return NULL;
3909#endif
3910 }
3911 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003912 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3913 (_PyUnicode_LENGTH(unicode) + 1));
3914 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 PyErr_NoMemory();
3916 return NULL;
3917 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003918 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3919 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3920 w = _PyUnicode_WSTR(unicode);
3921 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003923 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3924 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925 for (; w < wchar_end; ++one_byte, ++w)
3926 *w = *one_byte;
3927 /* null-terminate the wstr */
3928 *w = 0;
3929 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003930 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003932 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 for (; w < wchar_end; ++two_bytes, ++w)
3934 *w = *two_bytes;
3935 /* null-terminate the wstr */
3936 *w = 0;
3937#else
3938 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 PyObject_FREE(_PyUnicode_WSTR(unicode));
3940 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 Py_FatalError("Impossible unicode object state, wstr "
3942 "and str should share memory already.");
3943 return NULL;
3944#endif
3945 }
3946 else {
3947 assert(0 && "This should never happen.");
3948 }
3949 }
3950 }
3951 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003952 *size = PyUnicode_WSTR_LENGTH(unicode);
3953 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003954}
3955
Alexander Belopolsky40018472011-02-26 01:02:56 +00003956Py_UNICODE *
3957PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960}
3961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962
Alexander Belopolsky40018472011-02-26 01:02:56 +00003963Py_ssize_t
3964PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965{
3966 if (!PyUnicode_Check(unicode)) {
3967 PyErr_BadArgument();
3968 goto onError;
3969 }
3970 return PyUnicode_GET_SIZE(unicode);
3971
Benjamin Peterson29060642009-01-31 22:14:21 +00003972 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 return -1;
3974}
3975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976Py_ssize_t
3977PyUnicode_GetLength(PyObject *unicode)
3978{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003979 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 PyErr_BadArgument();
3981 return -1;
3982 }
3983
3984 return PyUnicode_GET_LENGTH(unicode);
3985}
3986
3987Py_UCS4
3988PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3989{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003990 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3991 PyErr_BadArgument();
3992 return (Py_UCS4)-1;
3993 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003994 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003995 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 return (Py_UCS4)-1;
3997 }
3998 return PyUnicode_READ_CHAR(unicode, index);
3999}
4000
4001int
4002PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4003{
4004 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004005 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return -1;
4007 }
Victor Stinner488fa492011-12-12 00:01:39 +01004008 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004009 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004010 PyErr_SetString(PyExc_IndexError, "string index out of range");
4011 return -1;
4012 }
Victor Stinner488fa492011-12-12 00:01:39 +01004013 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004014 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004015 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4016 PyErr_SetString(PyExc_ValueError, "character out of range");
4017 return -1;
4018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4020 index, ch);
4021 return 0;
4022}
4023
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *encoding = "utf-8";

    return encoding;
}
4029
Victor Stinner554f3f02010-06-16 23:33:54 +00004030/* create or adjust a UnicodeDecodeError */
4031static void
4032make_decode_exception(PyObject **exceptionObject,
4033 const char *encoding,
4034 const char *input, Py_ssize_t length,
4035 Py_ssize_t startpos, Py_ssize_t endpos,
4036 const char *reason)
4037{
4038 if (*exceptionObject == NULL) {
4039 *exceptionObject = PyUnicodeDecodeError_Create(
4040 encoding, input, length, startpos, endpos, reason);
4041 }
4042 else {
4043 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4044 goto onError;
4045 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4046 goto onError;
4047 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4048 goto onError;
4049 }
4050 return;
4051
4052onError:
4053 Py_DECREF(*exceptionObject);
4054 *exceptionObject = NULL;
4055}
4056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057/* error handling callback helper:
4058 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004059 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 and adjust various state variables.
4061 return 0 on success, -1 on error
4062*/
4063
Alexander Belopolsky40018472011-02-26 01:02:56 +00004064static int
4065unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004066 const char *encoding, const char *reason,
4067 const char **input, const char **inend, Py_ssize_t *startinpos,
4068 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004069 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004070{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004071 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072
4073 PyObject *restuple = NULL;
4074 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004075 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004076 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004077 Py_ssize_t requiredsize;
4078 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004079 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 int res = -1;
4081
Victor Stinner596a6c42011-11-09 00:02:18 +01004082 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4083 outsize = PyUnicode_GET_LENGTH(*output);
4084 else
4085 outsize = _PyUnicode_WSTR_LENGTH(*output);
4086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 *errorHandler = PyCodec_LookupError(errors);
4089 if (*errorHandler == NULL)
4090 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091 }
4092
Victor Stinner554f3f02010-06-16 23:33:54 +00004093 make_decode_exception(exceptionObject,
4094 encoding,
4095 *input, *inend - *input,
4096 *startinpos, *endinpos,
4097 reason);
4098 if (*exceptionObject == NULL)
4099 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100
4101 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4102 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004105 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 }
4108 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004110 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004111 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004112
4113 /* Copy back the bytes variables, which might have been modified by the
4114 callback */
4115 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4116 if (!inputobj)
4117 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004118 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004120 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004121 *input = PyBytes_AS_STRING(inputobj);
4122 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004123 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004124 /* we can DECREF safely, as the exception has another reference,
4125 so the object won't go away. */
4126 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004127
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004130 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4132 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004133 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134
Victor Stinner596a6c42011-11-09 00:02:18 +01004135 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4136 /* need more space? (at least enough for what we
4137 have+the replacement+the rest of the string (starting
4138 at the new input position), so we won't have to check space
4139 when there are no errors in the rest of the string) */
4140 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4141 requiredsize = *outpos + replen + insize-newpos;
4142 if (requiredsize > outsize) {
4143 if (requiredsize<2*outsize)
4144 requiredsize = 2*outsize;
4145 if (unicode_resize(output, requiredsize) < 0)
4146 goto onError;
4147 }
4148 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004150 copy_characters(*output, *outpos, repunicode, 0, replen);
4151 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004153 else {
4154 wchar_t *repwstr;
4155 Py_ssize_t repwlen;
4156 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4157 if (repwstr == NULL)
4158 goto onError;
4159 /* need more space? (at least enough for what we
4160 have+the replacement+the rest of the string (starting
4161 at the new input position), so we won't have to check space
4162 when there are no errors in the rest of the string) */
4163 requiredsize = *outpos + repwlen + insize-newpos;
4164 if (requiredsize > outsize) {
4165 if (requiredsize < 2*outsize)
4166 requiredsize = 2*outsize;
4167 if (unicode_resize(output, requiredsize) < 0)
4168 goto onError;
4169 }
4170 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4171 *outpos += repwlen;
4172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004174 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 /* we made it! */
4177 res = 0;
4178
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004180 Py_XDECREF(restuple);
4181 return res;
4182}
4183
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * expression argument (e.g. `a || b`) is evaluated as a whole. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004264
Alexander Belopolsky40018472011-02-26 01:02:56 +00004265PyObject *
4266PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004267 Py_ssize_t size,
4268 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004269{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004270 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4271}
4272
Antoine Pitrou244651a2009-05-04 18:56:13 +00004273/* The decoder. The only state we preserve is our read position,
4274 * i.e. how many characters we have consumed. So if we end in the
4275 * middle of a shift sequence we have to back off the read position
4276 * and the output to the beginning of the sequence, otherwise we lose
4277 * all the shift state (seen bits, number of bits seen, high
4278 * surrogate). */
4279
Alexander Belopolsky40018472011-02-26 01:02:56 +00004280PyObject *
4281PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004282 Py_ssize_t size,
4283 const char *errors,
4284 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004285{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004287 Py_ssize_t startinpos;
4288 Py_ssize_t endinpos;
4289 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004290 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004291 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292 const char *errmsg = "";
4293 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004294 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 unsigned int base64bits = 0;
4296 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004297 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298 PyObject *errorHandler = NULL;
4299 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004301 /* Start off assuming it's all ASCII. Widen later as necessary. */
4302 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004303 if (!unicode)
4304 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004305 if (size == 0) {
4306 if (consumed)
4307 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004308 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004309 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004311 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312 e = s + size;
4313
4314 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004315 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004317 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 if (inShift) { /* in a base-64 section */
4320 if (IS_BASE64(ch)) { /* consume a base-64 character */
4321 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4322 base64bits += 6;
4323 s++;
4324 if (base64bits >= 16) {
4325 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004326 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 base64bits -= 16;
4328 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4329 if (surrogate) {
4330 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004331 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4332 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4334 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004336 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 }
4338 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004339 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4340 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 }
4343 }
Victor Stinner551ac952011-11-29 22:58:13 +01004344 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 /* first surrogate */
4346 surrogate = outCh;
4347 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004349 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4350 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 }
4352 }
4353 }
4354 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355 inShift = 0;
4356 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004358 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4359 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004360 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 if (base64bits > 0) { /* left-over bits */
4363 if (base64bits >= 6) {
4364 /* We've seen at least one base-64 character */
4365 errmsg = "partial character in shift sequence";
4366 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 else {
4369 /* Some bits remain; they should be zero */
4370 if (base64buffer != 0) {
4371 errmsg = "non-zero padding bits in shift sequence";
4372 goto utf7Error;
4373 }
4374 }
4375 }
4376 if (ch != '-') {
4377 /* '-' is absorbed; other terminating
4378 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004379 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4380 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004382 }
4383 }
4384 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 s++; /* consume '+' */
4387 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004389 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4390 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 }
4392 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004394 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
4397 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4400 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 s++;
4402 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 else {
4404 startinpos = s-starts;
4405 s++;
4406 errmsg = "unexpected special character";
4407 goto utf7Error;
4408 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 endinpos = s-starts;
4412 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 errors, &errorHandler,
4414 "utf7", errmsg,
4415 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004416 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004417 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418 }
4419
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 /* end of string */
4421
4422 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4423 /* if we're in an inconsistent state, that's an error */
4424 if (surrogate ||
4425 (base64bits >= 6) ||
4426 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 endinpos = size;
4428 if (unicode_decode_call_errorhandler(
4429 errors, &errorHandler,
4430 "utf7", "unterminated shift sequence",
4431 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 goto onError;
4434 if (s < e)
4435 goto restart;
4436 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438
4439 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004440 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004442 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 }
4445 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004446 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004448 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004450 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 goto onError;
4452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 Py_XDECREF(errorHandler);
4454 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004455 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004456
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 Py_XDECREF(errorHandler);
4459 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 Py_DECREF(unicode);
4461 return NULL;
4462}
4463
4464
Alexander Belopolsky40018472011-02-26 01:02:56 +00004465PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004466_PyUnicode_EncodeUTF7(PyObject *str,
4467 int base64SetO,
4468 int base64WhiteSpace,
4469 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004471 int kind;
4472 void *data;
4473 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004474 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004475 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004477 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 unsigned int base64bits = 0;
4479 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 char * out;
4481 char * start;
4482
Benjamin Petersonbac79492012-01-14 13:34:47 -05004483 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484 return NULL;
4485 kind = PyUnicode_KIND(str);
4486 data = PyUnicode_DATA(str);
4487 len = PyUnicode_GET_LENGTH(str);
4488
4489 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004492 /* It might be possible to tighten this worst case */
4493 allocated = 8 * len;
4494 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004495 return PyErr_NoMemory();
4496
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 if (v == NULL)
4499 return NULL;
4500
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004501 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004502 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004503 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 if (inShift) {
4506 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4507 /* shifting out */
4508 if (base64bits) { /* output remaining bits */
4509 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4510 base64buffer = 0;
4511 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 }
4513 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 /* Characters not in the BASE64 set implicitly unshift the sequence
4515 so no '-' is required, except if the character is itself a '-' */
4516 if (IS_BASE64(ch) || ch == '-') {
4517 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 *out++ = (char) ch;
4520 }
4521 else {
4522 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004523 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 else { /* not in a shift sequence */
4526 if (ch == '+') {
4527 *out++ = '+';
4528 *out++ = '-';
4529 }
4530 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4531 *out++ = (char) ch;
4532 }
4533 else {
4534 *out++ = '+';
4535 inShift = 1;
4536 goto encode_char;
4537 }
4538 }
4539 continue;
4540encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004542 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004543
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 /* code first surrogate */
4545 base64bits += 16;
4546 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4547 while (base64bits >= 6) {
4548 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4549 base64bits -= 6;
4550 }
4551 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004552 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 base64bits += 16;
4555 base64buffer = (base64buffer << 16) | ch;
4556 while (base64bits >= 6) {
4557 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4558 base64bits -= 6;
4559 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004560 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (base64bits)
4562 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4563 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004564 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004565 if (_PyBytes_Resize(&v, out - start) < 0)
4566 return NULL;
4567 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004568}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004569PyObject *
4570PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4571 Py_ssize_t size,
4572 int base64SetO,
4573 int base64WhiteSpace,
4574 const char *errors)
4575{
4576 PyObject *result;
4577 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4578 if (tmp == NULL)
4579 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004580 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004581 base64WhiteSpace, errors);
4582 Py_DECREF(tmp);
4583 return result;
4584}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586#undef IS_BASE64
4587#undef FROM_BASE64
4588#undef TO_BASE64
4589#undef DECODE_DIRECT
4590#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592/* --- UTF-8 Codec -------------------------------------------------------- */
4593
/* Lookup table, never written to: hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4615
Alexander Belopolsky40018472011-02-26 01:02:56 +00004616PyObject *
4617PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004618 Py_ssize_t size,
4619 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620{
Walter Dörwald69652032004-09-07 20:24:22 +00004621 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4622}
4623
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004624#include "stringlib/ucs1lib.h"
4625#include "stringlib/codecs.h"
4626#include "stringlib/undef.h"
4627
4628#include "stringlib/ucs2lib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
4632#include "stringlib/ucs4lib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
Antoine Pitrouab868312009-01-10 15:40:25 +00004636/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4637#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4638
4639/* Mask to quickly check whether a C 'long' contains a
4640 non-ASCII, UTF8-encoded char. */
4641#if (SIZEOF_LONG == 8)
4642# define ASCII_CHAR_MASK 0x8080808080808080L
4643#elif (SIZEOF_LONG == 4)
4644# define ASCII_CHAR_MASK 0x80808080L
4645#else
4646# error C 'long' size should be either 4 or 8!
4647#endif
4648
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004649/* Scans a UTF-8 string and returns the maximum character to be expected
4650 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004651
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004652 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004654 */
4655static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004656utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004657{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004658 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004659 const unsigned char *end = p + string_size;
4660 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004661
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004662 assert(unicode_size != NULL);
4663
4664 /* By having a cascade of independent loops which fallback onto each
4665 other, we minimize the amount of work done in the average loop
4666 iteration, and we also maximize the CPU's ability to predict
4667 branches correctly (because a given condition will have always the
4668 same boolean outcome except perhaps in the last iteration of the
4669 corresponding loop).
4670 In the general case this brings us rather close to decoding
4671 performance pre-PEP 393, despite the two-pass decoding.
4672
4673 Note that the pure ASCII loop is not duplicated once a non-ASCII
4674 character has been encountered. It is actually a pessimization (by
4675 a significant factor) to use this loop on text with many non-ASCII
4676 characters, and it is important to avoid bad performance on valid
4677 utf-8 data (invalid utf-8 being a different can of worms).
4678 */
4679
4680 /* ASCII */
4681 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 /* Only check value if it's not a ASCII char... */
4683 if (*p < 0x80) {
4684 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4685 an explanation. */
4686 if (!((size_t) p & LONG_PTR_MASK)) {
4687 /* Help register allocation */
4688 register const unsigned char *_p = p;
4689 while (_p < aligned_end) {
4690 unsigned long value = *(unsigned long *) _p;
4691 if (value & ASCII_CHAR_MASK)
4692 break;
4693 _p += SIZEOF_LONG;
4694 char_count += SIZEOF_LONG;
4695 }
4696 p = _p;
4697 if (p == end)
4698 break;
4699 }
4700 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004701 if (*p < 0x80)
4702 ++char_count;
4703 else
4704 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004706 *unicode_size = char_count;
4707 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004708
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004709_ucs1loop:
4710 for (; p < end; ++p) {
4711 if (*p < 0xc4)
4712 char_count += ((*p & 0xc0) != 0x80);
4713 else
4714 goto _ucs2loop;
4715 }
4716 *unicode_size = char_count;
4717 return 255;
4718
4719_ucs2loop:
4720 for (; p < end; ++p) {
4721 if (*p < 0xf0)
4722 char_count += ((*p & 0xc0) != 0x80);
4723 else
4724 goto _ucs4loop;
4725 }
4726 *unicode_size = char_count;
4727 return 65535;
4728
4729_ucs4loop:
4730 for (; p < end; ++p) {
4731 char_count += ((*p & 0xc0) != 0x80);
4732 }
4733 *unicode_size = char_count;
4734 return 65537;
4735}
4736
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors.  Implicit names used from the expansion site: the
   variable `unicode` and the label `onError` (jumped to when resizing or
   writing fails).  `index` is evaluated exactly once.
   Potential resizing overallocates (by one eighth), so the result needs to
   shrink at the end.
   NOTE(review): the resize only triggers for pos > length, so pos == length
   is written without growing first -- confirm that unicode_putchar() may
   write at that index.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = index;                                         \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                      \
            unicode_resize(&unicode, pos + pos/8) < 0)                  \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, value) < 0)                 \
            goto onError;                                               \
    } while (0)
4750
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004751static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004752decode_utf8_errors(const char *starts,
4753 Py_ssize_t size,
4754 const char *errors,
4755 Py_ssize_t *consumed,
4756 const char *s,
4757 PyObject *unicode,
4758 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004761 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004762 Py_ssize_t startinpos;
4763 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004764 const char *e = starts + size;
4765 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004766 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004767 PyObject *errorHandler = NULL;
4768 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004769
Antoine Pitrouab868312009-01-10 15:40:25 +00004770 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771
4772 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004773 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774
4775 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004776 /* Fast path for runs of ASCII characters. Given that common UTF-8
4777 input will consist of an overwhelming majority of ASCII
4778 characters, we try to optimize for this case by checking
4779 as many characters as a C 'long' can contain.
4780 First, check if we can do an aligned read, as most CPUs have
4781 a penalty for unaligned reads.
4782 */
4783 if (!((size_t) s & LONG_PTR_MASK)) {
4784 /* Help register allocation */
4785 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004786 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004787 while (_s < aligned_end) {
4788 /* Read a whole long at a time (either 4 or 8 bytes),
4789 and do a fast unrolled copy if it only contains ASCII
4790 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 unsigned long value = *(unsigned long *) _s;
4792 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004793 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004794 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4795 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4796 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4797 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004798#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004799 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4800 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4801 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4802 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004803#endif
4804 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004805 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004806 }
4807 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004809 if (s == e)
4810 break;
4811 ch = (unsigned char)*s;
4812 }
4813 }
4814
4815 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004816 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 s++;
4818 continue;
4819 }
4820
4821 n = utf8_code_length[ch];
4822
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004823 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 if (consumed)
4825 break;
4826 else {
4827 errmsg = "unexpected end of data";
4828 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004829 endinpos = startinpos+1;
4830 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4831 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 goto utf8Error;
4833 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835
4836 switch (n) {
4837
4838 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004839 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 startinpos = s-starts;
4841 endinpos = startinpos+1;
4842 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
4844 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004845 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 startinpos = s-starts;
4847 endinpos = startinpos+1;
4848 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849
4850 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004851 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004852 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004854 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 goto utf8Error;
4856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004858 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004859 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 break;
4861
4862 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004863 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4864 will result in surrogates in range d800-dfff. Surrogates are
4865 not valid UTF-8 so they are rejected.
4866 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4867 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004868 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004869 (s[2] & 0xc0) != 0x80 ||
4870 ((unsigned char)s[0] == 0xE0 &&
4871 (unsigned char)s[1] < 0xA0) ||
4872 ((unsigned char)s[0] == 0xED &&
4873 (unsigned char)s[1] > 0x9F)) {
4874 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004876 endinpos = startinpos + 1;
4877
4878 /* if s[1] first two bits are 1 and 0, then the invalid
4879 continuation byte is s[2], so increment endinpos by 1,
4880 if not, s[1] is invalid and endinpos doesn't need to
4881 be incremented. */
4882 if ((s[1] & 0xC0) == 0x80)
4883 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 goto utf8Error;
4885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004887 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004888 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004889 break;
4890
4891 case 4:
4892 if ((s[1] & 0xc0) != 0x80 ||
4893 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004894 (s[3] & 0xc0) != 0x80 ||
4895 ((unsigned char)s[0] == 0xF0 &&
4896 (unsigned char)s[1] < 0x90) ||
4897 ((unsigned char)s[0] == 0xF4 &&
4898 (unsigned char)s[1] > 0x8F)) {
4899 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004901 endinpos = startinpos + 1;
4902 if ((s[1] & 0xC0) == 0x80) {
4903 endinpos++;
4904 if ((s[2] & 0xC0) == 0x80)
4905 endinpos++;
4906 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 goto utf8Error;
4908 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004909 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004910 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004911 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004912
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004913 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 }
4916 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004918
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 if (unicode_decode_call_errorhandler(
4921 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004922 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004924 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004926 /* Update data because unicode_decode_call_errorhandler might have
4927 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929 }
Walter Dörwald69652032004-09-07 20:24:22 +00004930 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004933 /* Adjust length and ready string when it contained errors and
4934 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004935 if (unicode_resize(&unicode, i) < 0)
4936 goto onError;
4937 unicode_adjust_maxchar(&unicode);
4938 if (unicode == NULL)
4939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941 Py_XDECREF(errorHandler);
4942 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004943 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004944 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 Py_XDECREF(errorHandler);
4948 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004949 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 return NULL;
4951}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004952#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004953
Victor Stinner785938e2011-12-11 20:09:03 +01004954PyObject *
4955PyUnicode_DecodeUTF8Stateful(const char *s,
4956 Py_ssize_t size,
4957 const char *errors,
4958 Py_ssize_t *consumed)
4959{
4960 Py_UCS4 maxchar = 0;
4961 Py_ssize_t unicode_size;
4962 int has_errors = 0;
4963 PyObject *unicode;
4964 int kind;
4965 void *data;
4966 const char *starts = s;
4967 const char *e;
4968 Py_ssize_t i;
4969
4970 if (size == 0) {
4971 if (consumed)
4972 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004973 Py_INCREF(unicode_empty);
4974 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004975 }
4976
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004977 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004978
4979 /* When the string is ASCII only, just use memcpy and return.
4980 unicode_size may be != size if there is an incomplete UTF-8
4981 sequence at the end of the ASCII block. */
4982 if (maxchar < 128 && size == unicode_size) {
4983 if (consumed)
4984 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01004985 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01004986 }
4987
4988 unicode = PyUnicode_New(unicode_size, maxchar);
4989 if (!unicode)
4990 return NULL;
4991 kind = PyUnicode_KIND(unicode);
4992 data = PyUnicode_DATA(unicode);
4993
4994 /* Unpack UTF-8 encoded data */
4995 i = 0;
4996 e = starts + size;
4997 switch (kind) {
4998 case PyUnicode_1BYTE_KIND:
4999 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
5000 break;
5001 case PyUnicode_2BYTE_KIND:
5002 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
5003 break;
5004 case PyUnicode_4BYTE_KIND:
5005 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
5006 break;
5007 }
5008 if (!has_errors) {
5009 /* Ensure the unicode size calculation was correct */
5010 assert(i == unicode_size);
5011 assert(s == e);
5012 if (consumed)
5013 *consumed = size;
5014 return unicode;
5015 }
5016
5017 /* In case of errors, maxchar and size computation might be incorrect;
5018 code below refits and resizes as necessary. */
5019 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5020}
5021
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t buffer.  Every byte that is not part of a valid
   sequence is emitted as 0xDC00 + byte and decoding resumes at the next
   byte.  Returns NULL if the buffer size would overflow (an exception is
   set with PyErr_NoMemory()) or if the allocation itself fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 lead = b[0];
        Py_UCS4 ch;
        int seqlen;
        int valid = 1;

        /* ASCII bytes map to themselves. */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (seqlen > end - s) {
            /* The sequence would run past the end of the input. */
            valid = 0;
        }
        else {
            switch (seqlen) {
            case 0:
            case 1:
                /* Not a valid lead byte. */
                valid = 0;
                break;

            case 2:
                if ((b[1] & 0xc0) != 0x80) {
                    valid = 0;
                    break;
                }
                ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
                assert ((ch > 0x007F) && (ch <= 0x07FF));
                *out++ = (wchar_t)ch;
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates
                   are not valid UTF-8 so they are rejected, as are overlong
                   forms starting with \xe0.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((b[1] & 0xc0) != 0x80 ||
                    (b[2] & 0xc0) != 0x80 ||
                    (b[0] == 0xE0 && b[1] < 0xA0) ||
                    (b[0] == 0xED && b[1] > 0x9F)) {
                    valid = 0;
                    break;
                }
                ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) +
                     (b[2] & 0x3f);
                assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                *out++ = (wchar_t)ch;
                break;

            case 4:
                /* Reject overlong forms (\xf0 followed by < \x90) and code
                   points above the Unicode range (\xf4 followed by > \x8f). */
                if ((b[1] & 0xc0) != 0x80 ||
                    (b[2] & 0xc0) != 0x80 ||
                    (b[3] & 0xc0) != 0x80 ||
                    (b[0] == 0xF0 && b[1] < 0x90) ||
                    (b[0] == 0xF4 && b[1] > 0x8F)) {
                    valid = 0;
                    break;
                }
                ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                     ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
                assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)ch;
#else
                /* compute and append the two surrogates: */
                *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
                *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
                break;
            }
        }

        if (valid) {
            s += seqlen;
        }
        else {
            /* surrogateescape: smuggle the offending byte through as a
               lone low surrogate and retry from the following byte. */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005130/* Primary internal function which creates utf8 encoded bytes objects.
5131
5132 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005133 and allocate exactly as much space needed at the end. Else allocate the
5134 maximum possible needed (4 result bytes per Unicode character), and return
5135 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005136*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005137PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005138_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139{
Victor Stinner6099a032011-12-18 14:22:26 +01005140 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141 void *data;
5142 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005144 if (!PyUnicode_Check(unicode)) {
5145 PyErr_BadArgument();
5146 return NULL;
5147 }
5148
5149 if (PyUnicode_READY(unicode) == -1)
5150 return NULL;
5151
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005152 if (PyUnicode_UTF8(unicode))
5153 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5154 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005155
5156 kind = PyUnicode_KIND(unicode);
5157 data = PyUnicode_DATA(unicode);
5158 size = PyUnicode_GET_LENGTH(unicode);
5159
Benjamin Petersonead6b532011-12-20 17:23:42 -06005160 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005161 default:
5162 assert(0);
5163 case PyUnicode_1BYTE_KIND:
5164 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5165 assert(!PyUnicode_IS_ASCII(unicode));
5166 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5167 case PyUnicode_2BYTE_KIND:
5168 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5169 case PyUnicode_4BYTE_KIND:
5170 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172}
5173
Alexander Belopolsky40018472011-02-26 01:02:56 +00005174PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5176 Py_ssize_t size,
5177 const char *errors)
5178{
5179 PyObject *v, *unicode;
5180
5181 unicode = PyUnicode_FromUnicode(s, size);
5182 if (unicode == NULL)
5183 return NULL;
5184 v = _PyUnicode_AsUTF8String(unicode, errors);
5185 Py_DECREF(unicode);
5186 return v;
5187}
5188
5189PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005190PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005192 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193}
5194
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195/* --- UTF-32 Codec ------------------------------------------------------- */
5196
5197PyObject *
5198PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 Py_ssize_t size,
5200 const char *errors,
5201 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005202{
5203 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5204}
5205
5206PyObject *
5207PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_ssize_t size,
5209 const char *errors,
5210 int *byteorder,
5211 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212{
5213 const char *starts = s;
5214 Py_ssize_t startinpos;
5215 Py_ssize_t endinpos;
5216 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005217 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005218 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219 int bo = 0; /* assume native ordering by default */
5220 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005221 /* Offsets from q for retrieving bytes in the right order. */
5222#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5223 int iorder[] = {0, 1, 2, 3};
5224#else
5225 int iorder[] = {3, 2, 1, 0};
5226#endif
5227 PyObject *errorHandler = NULL;
5228 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005229
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230 q = (unsigned char *)s;
5231 e = q + size;
5232
5233 if (byteorder)
5234 bo = *byteorder;
5235
5236 /* Check for BOM marks (U+FEFF) in the input and adjust current
5237 byte order setting accordingly. In native mode, the leading BOM
5238 mark is skipped, in all other modes, it is copied to the output
5239 stream as-is (giving a ZWNBSP character). */
5240 if (bo == 0) {
5241 if (size >= 4) {
5242 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 if (bom == 0x0000FEFF) {
5246 q += 4;
5247 bo = -1;
5248 }
5249 else if (bom == 0xFFFE0000) {
5250 q += 4;
5251 bo = 1;
5252 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 if (bom == 0x0000FEFF) {
5255 q += 4;
5256 bo = 1;
5257 }
5258 else if (bom == 0xFFFE0000) {
5259 q += 4;
5260 bo = -1;
5261 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005262#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264 }
5265
5266 if (bo == -1) {
5267 /* force LE */
5268 iorder[0] = 0;
5269 iorder[1] = 1;
5270 iorder[2] = 2;
5271 iorder[3] = 3;
5272 }
5273 else if (bo == 1) {
5274 /* force BE */
5275 iorder[0] = 3;
5276 iorder[1] = 2;
5277 iorder[2] = 1;
5278 iorder[3] = 0;
5279 }
5280
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005281 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005282 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005283 if (!unicode)
5284 return NULL;
5285 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005286 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005287 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005288
Walter Dörwald41980ca2007-08-16 21:55:45 +00005289 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 Py_UCS4 ch;
5291 /* remaining bytes at the end? (size should be divisible by 4) */
5292 if (e-q<4) {
5293 if (consumed)
5294 break;
5295 errmsg = "truncated data";
5296 startinpos = ((const char *)q)-starts;
5297 endinpos = ((const char *)e)-starts;
5298 goto utf32Error;
5299 /* The remaining input chars are ignored if the callback
5300 chooses to skip the input */
5301 }
5302 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5303 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005304
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 if (ch >= 0x110000)
5306 {
5307 errmsg = "codepoint not in range(0x110000)";
5308 startinpos = ((const char *)q)-starts;
5309 endinpos = startinpos+4;
5310 goto utf32Error;
5311 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005312 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5313 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 q += 4;
5315 continue;
5316 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 if (unicode_decode_call_errorhandler(
5318 errors, &errorHandler,
5319 "utf32", errmsg,
5320 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005321 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005323 }
5324
5325 if (byteorder)
5326 *byteorder = bo;
5327
5328 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005330
5331 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005332 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005333 goto onError;
5334
5335 Py_XDECREF(errorHandler);
5336 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005337 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005338
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005340 Py_DECREF(unicode);
5341 Py_XDECREF(errorHandler);
5342 Py_XDECREF(exc);
5343 return NULL;
5344}
5345
5346PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005347_PyUnicode_EncodeUTF32(PyObject *str,
5348 const char *errors,
5349 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005350{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005351 int kind;
5352 void *data;
5353 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005354 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005356 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005357 /* Offsets from p for storing byte pairs in the right order. */
5358#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5359 int iorder[] = {0, 1, 2, 3};
5360#else
5361 int iorder[] = {3, 2, 1, 0};
5362#endif
5363
Benjamin Peterson29060642009-01-31 22:14:21 +00005364#define STORECHAR(CH) \
5365 do { \
5366 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5367 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5368 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5369 p[iorder[0]] = (CH) & 0xff; \
5370 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005371 } while(0)
5372
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373 if (!PyUnicode_Check(str)) {
5374 PyErr_BadArgument();
5375 return NULL;
5376 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005377 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005378 return NULL;
5379 kind = PyUnicode_KIND(str);
5380 data = PyUnicode_DATA(str);
5381 len = PyUnicode_GET_LENGTH(str);
5382
5383 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005384 bytesize = nsize * 4;
5385 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005387 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005388 if (v == NULL)
5389 return NULL;
5390
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005391 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005392 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005394 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005395 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396
5397 if (byteorder == -1) {
5398 /* force LE */
5399 iorder[0] = 0;
5400 iorder[1] = 1;
5401 iorder[2] = 2;
5402 iorder[3] = 3;
5403 }
5404 else if (byteorder == 1) {
5405 /* force BE */
5406 iorder[0] = 3;
5407 iorder[1] = 2;
5408 iorder[2] = 1;
5409 iorder[3] = 0;
5410 }
5411
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005412 for (i = 0; i < len; i++)
5413 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005414
5415 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005416 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005417#undef STORECHAR
5418}
5419
Alexander Belopolsky40018472011-02-26 01:02:56 +00005420PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005421PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5422 Py_ssize_t size,
5423 const char *errors,
5424 int byteorder)
5425{
5426 PyObject *result;
5427 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5428 if (tmp == NULL)
5429 return NULL;
5430 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5431 Py_DECREF(tmp);
5432 return result;
5433}
5434
5435PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005436PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005437{
Victor Stinnerb960b342011-11-20 19:12:52 +01005438 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439}
5440
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441/* --- UTF-16 Codec ------------------------------------------------------- */
5442
Tim Peters772747b2001-08-09 22:21:55 +00005443PyObject *
5444PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 Py_ssize_t size,
5446 const char *errors,
5447 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448{
Walter Dörwald69652032004-09-07 20:24:22 +00005449 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5450}
5451
Antoine Pitrouab868312009-01-10 15:40:25 +00005452/* Two masks for fast checking of whether a C 'long' may contain
5453 UTF16-encoded surrogate characters. This is an efficient heuristic,
5454 assuming that non-surrogate characters with a code point >= 0x8000 are
5455 rare in most input.
5456 FAST_CHAR_MASK is used when the input is in native byte ordering,
5457 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005458*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005459#if (SIZEOF_LONG == 8)
5460# define FAST_CHAR_MASK 0x8000800080008000L
5461# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005462# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005463#elif (SIZEOF_LONG == 4)
5464# define FAST_CHAR_MASK 0x80008000L
5465# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005466# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005467#else
5468# error C 'long' size should be either 4 or 8!
5469#endif
5470
Walter Dörwald69652032004-09-07 20:24:22 +00005471PyObject *
5472PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 Py_ssize_t size,
5474 const char *errors,
5475 int *byteorder,
5476 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005479 Py_ssize_t startinpos;
5480 Py_ssize_t endinpos;
5481 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005482 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005483 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005484 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005485 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005486 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005487 /* Offsets from q for retrieving byte pairs in the right order. */
5488#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5489 int ihi = 1, ilo = 0;
5490#else
5491 int ihi = 0, ilo = 1;
5492#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 PyObject *errorHandler = NULL;
5494 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495
5496 /* Note: size will always be longer than the resulting Unicode
5497 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005498 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 if (!unicode)
5500 return NULL;
5501 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005502 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005503 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504
Tim Peters772747b2001-08-09 22:21:55 +00005505 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005506 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507
5508 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005509 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005511 /* Check for BOM marks (U+FEFF) in the input and adjust current
5512 byte order setting accordingly. In native mode, the leading BOM
5513 mark is skipped, in all other modes, it is copied to the output
5514 stream as-is (giving a ZWNBSP character). */
5515 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005516 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005517 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005518#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 if (bom == 0xFEFF) {
5520 q += 2;
5521 bo = -1;
5522 }
5523 else if (bom == 0xFFFE) {
5524 q += 2;
5525 bo = 1;
5526 }
Tim Petersced69f82003-09-16 20:30:58 +00005527#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 if (bom == 0xFEFF) {
5529 q += 2;
5530 bo = 1;
5531 }
5532 else if (bom == 0xFFFE) {
5533 q += 2;
5534 bo = -1;
5535 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005536#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539
Tim Peters772747b2001-08-09 22:21:55 +00005540 if (bo == -1) {
5541 /* force LE */
5542 ihi = 1;
5543 ilo = 0;
5544 }
5545 else if (bo == 1) {
5546 /* force BE */
5547 ihi = 0;
5548 ilo = 1;
5549 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005550#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5551 native_ordering = ilo < ihi;
5552#else
5553 native_ordering = ilo > ihi;
5554#endif
Tim Peters772747b2001-08-09 22:21:55 +00005555
Antoine Pitrouab868312009-01-10 15:40:25 +00005556 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005557 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005558 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005559 /* First check for possible aligned read of a C 'long'. Unaligned
5560 reads are more expensive, better to defer to another iteration. */
5561 if (!((size_t) q & LONG_PTR_MASK)) {
5562 /* Fast path for runs of non-surrogate chars. */
5563 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564 int kind = PyUnicode_KIND(unicode);
5565 void *data = PyUnicode_DATA(unicode);
5566 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005567 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005568 Py_UCS4 maxch;
5569 if (native_ordering) {
5570 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005571 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005572 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005573 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005574 else {
5575 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005576 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005577 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005578 block = ((block >> 8) & STRIPPED_MASK) |
5579 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005580 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005581 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005582#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005583 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005584 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005585 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005586 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005587 ch = (Py_UCS2)(block >> 48);
Victor Stinnere6abb482012-05-02 01:15:40 +02005588 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005589#else
5590 ch = (Py_UCS2)(block >> 16);
Victor Stinnere6abb482012-05-02 01:15:40 +02005591 maxch = MAX_MAXCHAR(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005592#endif
5593 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5594 if (unicode_widen(&unicode, maxch) < 0)
5595 goto onError;
5596 kind = PyUnicode_KIND(unicode);
5597 data = PyUnicode_DATA(unicode);
5598 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005599#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5600 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005602 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5603 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5604 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5605#else
5606 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5607#endif
5608#else
5609#if SIZEOF_LONG == 8
5610 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5611 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5612 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5613#else
5614 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5615#endif
5616 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005617#endif
5618 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005619 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005620 q = _q;
5621 if (q >= e)
5622 break;
5623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625
Benjamin Peterson14339b62009-01-31 16:36:08 +00005626 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005627
Victor Stinner551ac952011-11-29 22:58:13 +01005628 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005629 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5630 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 continue;
5632 }
5633
5634 /* UTF-16 code pair: */
5635 if (q > e) {
5636 errmsg = "unexpected end of data";
5637 startinpos = (((const char *)q) - 2) - starts;
5638 endinpos = ((const char *)e) + 1 - starts;
5639 goto utf16Error;
5640 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005641 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5642 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005644 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005645 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005646 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005647 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 continue;
5649 }
5650 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005651 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 startinpos = (((const char *)q)-4)-starts;
5653 endinpos = startinpos+2;
5654 goto utf16Error;
5655 }
5656
Benjamin Peterson14339b62009-01-31 16:36:08 +00005657 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 errmsg = "illegal encoding";
5659 startinpos = (((const char *)q)-2)-starts;
5660 endinpos = startinpos+2;
5661 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005662
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005665 errors,
5666 &errorHandler,
5667 "utf16", errmsg,
5668 &starts,
5669 (const char **)&e,
5670 &startinpos,
5671 &endinpos,
5672 &exc,
5673 (const char **)&q,
5674 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005678 /* remaining byte at the end? (size should be even) */
5679 if (e == q) {
5680 if (!consumed) {
5681 errmsg = "truncated data";
5682 startinpos = ((const char *)q) - starts;
5683 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005684 if (unicode_decode_call_errorhandler(
5685 errors,
5686 &errorHandler,
5687 "utf16", errmsg,
5688 &starts,
5689 (const char **)&e,
5690 &startinpos,
5691 &endinpos,
5692 &exc,
5693 (const char **)&q,
5694 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005696 goto onError;
5697 /* The remaining input chars are ignored if the callback
5698 chooses to skip the input */
5699 }
5700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701
5702 if (byteorder)
5703 *byteorder = bo;
5704
Walter Dörwald69652032004-09-07 20:24:22 +00005705 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005709 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 goto onError;
5711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 Py_XDECREF(errorHandler);
5713 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005714 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718 Py_XDECREF(errorHandler);
5719 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 return NULL;
5721}
5722
Antoine Pitrouab868312009-01-10 15:40:25 +00005723#undef FAST_CHAR_MASK
5724#undef SWAPPED_FAST_CHAR_MASK
5725
Tim Peters772747b2001-08-09 22:21:55 +00005726PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005727_PyUnicode_EncodeUTF16(PyObject *str,
5728 const char *errors,
5729 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731 int kind;
5732 void *data;
5733 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005734 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005735 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005736 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005737 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005738 /* Offsets from p for storing byte pairs in the right order. */
5739#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5740 int ihi = 1, ilo = 0;
5741#else
5742 int ihi = 0, ilo = 1;
5743#endif
5744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745#define STORECHAR(CH) \
5746 do { \
5747 p[ihi] = ((CH) >> 8) & 0xff; \
5748 p[ilo] = (CH) & 0xff; \
5749 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005750 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 if (!PyUnicode_Check(str)) {
5753 PyErr_BadArgument();
5754 return NULL;
5755 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005756 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 return NULL;
5758 kind = PyUnicode_KIND(str);
5759 data = PyUnicode_DATA(str);
5760 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005761
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 pairs = 0;
5763 if (kind == PyUnicode_4BYTE_KIND)
5764 for (i = 0; i < len; i++)
5765 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5766 pairs++;
5767 /* 2 * (len + pairs + (byteorder == 0)) */
5768 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005770 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005771 bytesize = nsize * 2;
5772 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005774 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 if (v == NULL)
5776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005778 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005781 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005782 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005783
5784 if (byteorder == -1) {
5785 /* force LE */
5786 ihi = 1;
5787 ilo = 0;
5788 }
5789 else if (byteorder == 1) {
5790 /* force BE */
5791 ihi = 0;
5792 ilo = 1;
5793 }
5794
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795 for (i = 0; i < len; i++) {
5796 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5797 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005799 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5800 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 }
Tim Peters772747b2001-08-09 22:21:55 +00005802 STORECHAR(ch);
5803 if (ch2)
5804 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005805 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005806
5807 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005808 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005809#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810}
5811
Alexander Belopolsky40018472011-02-26 01:02:56 +00005812PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005813PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5814 Py_ssize_t size,
5815 const char *errors,
5816 int byteorder)
5817{
5818 PyObject *result;
5819 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5820 if (tmp == NULL)
5821 return NULL;
5822 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5823 Py_DECREF(tmp);
5824 return result;
5825}
5826
5827PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005828PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005830 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831}
5832
5833/* --- Unicode Escape Codec ----------------------------------------------- */
5834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5836 if all the escapes in the string make it still a valid ASCII string.
5837 Returns -1 if any escapes were found which cause the string to
5838 pop out of ASCII range. Otherwise returns the length of the
5839 required buffer to hold the string.
5840 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005841static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5843{
5844 const unsigned char *p = (const unsigned char *)s;
5845 const unsigned char *end = p + size;
5846 Py_ssize_t length = 0;
5847
5848 if (size < 0)
5849 return -1;
5850
5851 for (; p < end; ++p) {
5852 if (*p > 127) {
5853 /* Non-ASCII */
5854 return -1;
5855 }
5856 else if (*p != '\\') {
5857 /* Normal character */
5858 ++length;
5859 }
5860 else {
5861 /* Backslash-escape, check next char */
5862 ++p;
5863 /* Escape sequence reaches till end of string or
5864 non-ASCII follow-up. */
5865 if (p >= end || *p > 127)
5866 return -1;
5867 switch (*p) {
5868 case '\n':
5869 /* backslash + \n result in zero characters */
5870 break;
5871 case '\\': case '\'': case '\"':
5872 case 'b': case 'f': case 't':
5873 case 'n': case 'r': case 'v': case 'a':
5874 ++length;
5875 break;
5876 case '0': case '1': case '2': case '3':
5877 case '4': case '5': case '6': case '7':
5878 case 'x': case 'u': case 'U': case 'N':
5879 /* these do not guarantee ASCII characters */
5880 return -1;
5881 default:
5882 /* count the backslash + the other character */
5883 length += 2;
5884 }
5885 }
5886 }
5887 return length;
5888}
5889
Fredrik Lundh06d12682001-01-24 07:59:11 +00005890static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005891
Alexander Belopolsky40018472011-02-26 01:02:56 +00005892PyObject *
5893PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005894 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005895 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005898 Py_ssize_t startinpos;
5899 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005901 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005903 char* message;
5904 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 PyObject *errorHandler = NULL;
5906 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005907 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005908 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005909
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005910 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005911
5912 /* After length_of_escaped_ascii_string() there are two alternatives,
5913 either the string is pure ASCII with named escapes like \n, etc.
5914 and we determined it's exact size (common case)
5915 or it contains \x, \u, ... escape sequences. then we create a
5916 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005917 if (len >= 0) {
5918 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005919 if (!v)
5920 goto onError;
5921 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005922 }
5923 else {
5924 /* Escaped strings will always be longer than the resulting
5925 Unicode string, so we start with size here and then reduce the
5926 length after conversion to the true value.
5927 (but if the error callback returns a long replacement string
5928 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005929 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930 if (!v)
5931 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005932 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005933 }
5934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005936 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005937 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005939
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 while (s < end) {
5941 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005942 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 /* The only case in which i == ascii_length is a backslash
5946 followed by a newline. */
5947 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 /* Non-escape characters are interpreted as Unicode ordinals */
5950 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005951 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5952 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 continue;
5954 }
5955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 /* \ - Escapes */
5958 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005959 c = *s++;
5960 if (s > end)
5961 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005962
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005963 /* The only case in which i == ascii_length is a backslash
5964 followed by a newline. */
5965 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005966
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005967 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005970#define WRITECHAR(ch) \
5971 do { \
5972 if (unicode_putchar(&v, &i, ch) < 0) \
5973 goto onError; \
5974 }while(0)
5975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005977 case '\\': WRITECHAR('\\'); break;
5978 case '\'': WRITECHAR('\''); break;
5979 case '\"': WRITECHAR('\"'); break;
5980 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005981 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005982 case 'f': WRITECHAR('\014'); break;
5983 case 't': WRITECHAR('\t'); break;
5984 case 'n': WRITECHAR('\n'); break;
5985 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005987 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005988 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005989 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 case '0': case '1': case '2': case '3':
5993 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005994 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005995 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005996 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005997 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005998 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006000 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 break;
6002
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 /* hex escapes */
6004 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006006 digits = 2;
6007 message = "truncated \\xXX escape";
6008 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006012 digits = 4;
6013 message = "truncated \\uXXXX escape";
6014 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006017 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006018 digits = 8;
6019 message = "truncated \\UXXXXXXXX escape";
6020 hexescape:
6021 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 if (s+digits>end) {
6023 endinpos = size;
6024 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 errors, &errorHandler,
6026 "unicodeescape", "end of string in escape sequence",
6027 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006028 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 goto onError;
6030 goto nextByte;
6031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006032 for (j = 0; j < digits; ++j) {
6033 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006034 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006035 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 errors, &errorHandler,
6038 "unicodeescape", message,
6039 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006040 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006042 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006044 }
6045 chr = (chr<<4) & ~0xF;
6046 if (c >= '0' && c <= '9')
6047 chr += c - '0';
6048 else if (c >= 'a' && c <= 'f')
6049 chr += 10 + c - 'a';
6050 else
6051 chr += 10 + c - 'A';
6052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006053 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006054 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006055 /* _decoding_error will have already written into the
6056 target buffer. */
6057 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006058 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006059 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006060 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006061 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006062 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 errors, &errorHandler,
6066 "unicodeescape", "illegal Unicode character",
6067 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006068 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006069 goto onError;
6070 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006071 break;
6072
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074 case 'N':
6075 message = "malformed \\N character escape";
6076 if (ucnhash_CAPI == NULL) {
6077 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6079 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006080 if (ucnhash_CAPI == NULL)
6081 goto ucnhashError;
6082 }
6083 if (*s == '{') {
6084 const char *start = s+1;
6085 /* look for the closing brace */
6086 while (*s != '}' && s < end)
6087 s++;
6088 if (s > start && s < end && *s == '}') {
6089 /* found a name. look it up in the unicode database */
6090 message = "unknown Unicode character name";
6091 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006092 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006093 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 goto store;
6095 }
6096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 errors, &errorHandler,
6100 "unicodeescape", message,
6101 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006103 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006104 break;
6105
6106 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006107 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 message = "\\ at end of string";
6109 s--;
6110 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 errors, &errorHandler,
6113 "unicodeescape", message,
6114 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006115 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006116 goto onError;
6117 }
6118 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006119 WRITECHAR('\\');
6120 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006121 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006122 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006128
Victor Stinner16e6a802011-12-12 13:24:15 +01006129 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006130 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006131 Py_XDECREF(errorHandler);
6132 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006133 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006134
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006136 PyErr_SetString(
6137 PyExc_UnicodeError,
6138 "\\N escapes not supported (can't load unicodedata module)"
6139 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006140 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006143 return NULL;
6144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
6152/* Return a Unicode-Escape string version of the Unicode object.
6153
6154 If quotes is true, the string is enclosed in u"" or u'' quotes as
6155 appropriate.
6156
6157*/
6158
Alexander Belopolsky40018472011-02-26 01:02:56 +00006159PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006163 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 int kind;
6166 void *data;
6167 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168
Thomas Wouters89f507f2006-12-13 04:49:30 +00006169 /* Initial allocation is based on the longest-possible unichr
6170 escape.
6171
6172 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6173 unichr, so in this case it's the longest unichr escape. In
6174 narrow (UTF-16) builds this is five chars per source unichr
6175 since there are two unichrs in the surrogate pair, so in narrow
6176 (UTF-16) builds it's not the longest unichr escape.
6177
6178 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6179 so in the narrow (UTF-16) build case it's the longest unichr
6180 escape.
6181 */
6182
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (!PyUnicode_Check(unicode)) {
6184 PyErr_BadArgument();
6185 return NULL;
6186 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006187 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 return NULL;
6189 len = PyUnicode_GET_LENGTH(unicode);
6190 kind = PyUnicode_KIND(unicode);
6191 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006192 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6194 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6195 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6196 }
6197
6198 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006199 return PyBytes_FromStringAndSize(NULL, 0);
6200
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006203
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 if (repr == NULL)
6209 return NULL;
6210
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006211 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006214 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006215
Walter Dörwald79e913e2007-05-12 11:08:06 +00006216 /* Escape backslashes */
6217 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 *p++ = '\\';
6219 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006220 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006221 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006222
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006223 /* Map 21-bit characters to '\U00xxxxxx' */
6224 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006225 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006226 *p++ = '\\';
6227 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006228 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6229 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6230 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6231 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6232 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6233 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6234 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6235 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006237 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006238
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006240 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 *p++ = '\\';
6242 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006243 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6244 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6245 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6246 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006248
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006249 /* Map special whitespace to '\t', \n', '\r' */
6250 else if (ch == '\t') {
6251 *p++ = '\\';
6252 *p++ = 't';
6253 }
6254 else if (ch == '\n') {
6255 *p++ = '\\';
6256 *p++ = 'n';
6257 }
6258 else if (ch == '\r') {
6259 *p++ = '\\';
6260 *p++ = 'r';
6261 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006262
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006263 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006264 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006266 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006267 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6268 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006269 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006270
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 /* Copy everything else as-is */
6272 else
6273 *p++ = (char) ch;
6274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006276 assert(p - PyBytes_AS_STRING(repr) > 0);
6277 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6278 return NULL;
6279 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
Alexander Belopolsky40018472011-02-26 01:02:56 +00006282PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006283PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006286 PyObject *result;
6287 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6288 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006290 result = PyUnicode_AsUnicodeEscapeString(tmp);
6291 Py_DECREF(tmp);
6292 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293}
6294
6295/* --- Raw Unicode Escape Codec ------------------------------------------- */
6296
Alexander Belopolsky40018472011-02-26 01:02:56 +00006297PyObject *
6298PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006299 Py_ssize_t size,
6300 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006302 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006303 Py_ssize_t startinpos;
6304 Py_ssize_t endinpos;
6305 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006306 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 const char *end;
6308 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006309 PyObject *errorHandler = NULL;
6310 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006311
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 /* Escaped strings will always be longer than the resulting
6313 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314 length after conversion to the true value. (But decoding error
6315 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006316 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006320 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006321 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 end = s + size;
6323 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 unsigned char c;
6325 Py_UCS4 x;
6326 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006327 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* Non-escape characters are interpreted as Unicode ordinals */
6330 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006331 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6332 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 startinpos = s-starts;
6336
6337 /* \u-escapes are only interpreted iff the number of leading
6338 backslashes if odd */
6339 bs = s;
6340 for (;s < end;) {
6341 if (*s != '\\')
6342 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006343 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6344 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 }
6346 if (((s - bs) & 1) == 0 ||
6347 s >= end ||
6348 (*s != 'u' && *s != 'U')) {
6349 continue;
6350 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006351 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 count = *s=='u' ? 4 : 8;
6353 s++;
6354
6355 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 for (x = 0, i = 0; i < count; ++i, ++s) {
6357 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006358 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 endinpos = s-starts;
6360 if (unicode_decode_call_errorhandler(
6361 errors, &errorHandler,
6362 "rawunicodeescape", "truncated \\uXXXX",
6363 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006364 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 goto onError;
6366 goto nextByte;
6367 }
6368 x = (x<<4) & ~0xF;
6369 if (c >= '0' && c <= '9')
6370 x += c - '0';
6371 else if (c >= 'a' && c <= 'f')
6372 x += 10 + c - 'a';
6373 else
6374 x += 10 + c - 'A';
6375 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006376 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006377 if (unicode_putchar(&v, &outpos, x) < 0)
6378 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006379 } else {
6380 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006381 if (unicode_decode_call_errorhandler(
6382 errors, &errorHandler,
6383 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006385 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006387 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 nextByte:
6389 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006391 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006395 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006396
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 return NULL;
6402}
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404
Alexander Belopolsky40018472011-02-26 01:02:56 +00006405PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006408 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 char *p;
6410 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 Py_ssize_t expandsize, pos;
6412 int kind;
6413 void *data;
6414 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416 if (!PyUnicode_Check(unicode)) {
6417 PyErr_BadArgument();
6418 return NULL;
6419 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006420 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 return NULL;
6422 kind = PyUnicode_KIND(unicode);
6423 data = PyUnicode_DATA(unicode);
6424 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006425 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6426 bytes, and 1 byte characters 4. */
6427 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006428
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006429 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006431
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006432 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 if (repr == NULL)
6434 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006436 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006438 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006439 for (pos = 0; pos < len; pos++) {
6440 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 /* Map 32-bit characters to '\Uxxxxxxxx' */
6442 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006443 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006444 *p++ = '\\';
6445 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006446 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6447 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6448 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6449 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6450 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6451 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6452 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6453 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006454 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006456 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 *p++ = '\\';
6458 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006459 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6460 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6461 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6462 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* Copy everything else as-is */
6465 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 *p++ = (char) ch;
6467 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006468
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006469 assert(p > q);
6470 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006471 return NULL;
6472 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473}
6474
Alexander Belopolsky40018472011-02-26 01:02:56 +00006475PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6477 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 PyObject *result;
6480 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6481 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006482 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6484 Py_DECREF(tmp);
6485 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486}
6487
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006488/* --- Unicode Internal Codec ------------------------------------------- */
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
6491_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006492 Py_ssize_t size,
6493 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006494{
6495 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006496 Py_ssize_t startinpos;
6497 Py_ssize_t endinpos;
6498 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006499 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006500 const char *end;
6501 const char *reason;
6502 PyObject *errorHandler = NULL;
6503 PyObject *exc = NULL;
6504
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006505 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006506 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006507 1))
6508 return NULL;
6509
Thomas Wouters89f507f2006-12-13 04:49:30 +00006510 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006511 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006512 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006514 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006515 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006516 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517 end = s + size;
6518
6519 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006520 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006521 Py_UCS4 ch;
6522 /* We copy the raw representation one byte at a time because the
6523 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006524 ((char *) &uch)[0] = s[0];
6525 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006526#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006527 ((char *) &uch)[2] = s[2];
6528 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006529#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006530 ch = uch;
6531
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532 /* We have to sanity check the raw data, otherwise doom looms for
6533 some malformed UCS-4 data. */
6534 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006535#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006536 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006537#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006538 end-s < Py_UNICODE_SIZE
6539 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006541 startinpos = s - starts;
6542 if (end-s < Py_UNICODE_SIZE) {
6543 endinpos = end-starts;
6544 reason = "truncated input";
6545 }
6546 else {
6547 endinpos = s - starts + Py_UNICODE_SIZE;
6548 reason = "illegal code point (> 0x10FFFF)";
6549 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006550 if (unicode_decode_call_errorhandler(
6551 errors, &errorHandler,
6552 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006553 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006554 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006556 continue;
6557 }
6558
6559 s += Py_UNICODE_SIZE;
6560#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006561 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006563 Py_UNICODE uch2;
6564 ((char *) &uch2)[0] = s[0];
6565 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006566 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 {
Victor Stinner551ac952011-11-29 22:58:13 +01006568 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006569 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 }
6571 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572#endif
6573
6574 if (unicode_putchar(&v, &outpos, ch) < 0)
6575 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006576 }
6577
Victor Stinner16e6a802011-12-12 13:24:15 +01006578 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006579 goto onError;
6580 Py_XDECREF(errorHandler);
6581 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006582 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006583
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006585 Py_XDECREF(v);
6586 Py_XDECREF(errorHandler);
6587 Py_XDECREF(exc);
6588 return NULL;
6589}
6590
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591/* --- Latin-1 Codec ------------------------------------------------------ */
6592
Alexander Belopolsky40018472011-02-26 01:02:56 +00006593PyObject *
6594PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006595 Py_ssize_t size,
6596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006599 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600}
6601
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006602/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006603static void
6604make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006605 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006606 PyObject *unicode,
6607 Py_ssize_t startpos, Py_ssize_t endpos,
6608 const char *reason)
6609{
6610 if (*exceptionObject == NULL) {
6611 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006613 encoding, unicode, startpos, endpos, reason);
6614 }
6615 else {
6616 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6617 goto onError;
6618 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6619 goto onError;
6620 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6621 goto onError;
6622 return;
6623 onError:
6624 Py_DECREF(*exceptionObject);
6625 *exceptionObject = NULL;
6626 }
6627}
6628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006630static void
6631raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006632 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006633 PyObject *unicode,
6634 Py_ssize_t startpos, Py_ssize_t endpos,
6635 const char *reason)
6636{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006637 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006638 encoding, unicode, startpos, endpos, reason);
6639 if (*exceptionObject != NULL)
6640 PyCodec_StrictErrors(*exceptionObject);
6641}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006642
6643/* error handling callback helper:
6644 build arguments, call the callback and check the arguments,
6645 put the result into newpos and return the replacement string, which
6646 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006647static PyObject *
6648unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006649 PyObject **errorHandler,
6650 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006651 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006652 Py_ssize_t startpos, Py_ssize_t endpos,
6653 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006654{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006655 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 PyObject *restuple;
6658 PyObject *resunicode;
6659
6660 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 }
6665
Benjamin Petersonbac79492012-01-14 13:34:47 -05006666 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006667 return NULL;
6668 len = PyUnicode_GET_LENGTH(unicode);
6669
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006670 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674
6675 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006680 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 Py_DECREF(restuple);
6682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006684 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 &resunicode, newpos)) {
6686 Py_DECREF(restuple);
6687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006689 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6690 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6691 Py_DECREF(restuple);
6692 return NULL;
6693 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006695 *newpos = len + *newpos;
6696 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6698 Py_DECREF(restuple);
6699 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 Py_INCREF(resunicode);
6702 Py_DECREF(restuple);
6703 return resunicode;
6704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006707unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006708 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006709 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006711 /* input state */
6712 Py_ssize_t pos=0, size;
6713 int kind;
6714 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 /* output object */
6716 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 /* pointer into the output */
6718 char *str;
6719 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006720 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006721 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6722 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 PyObject *errorHandler = NULL;
6724 PyObject *exc = NULL;
6725 /* the following variable is used for caching string comparisons
6726 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6727 int known_errorHandler = -1;
6728
Benjamin Petersonbac79492012-01-14 13:34:47 -05006729 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 return NULL;
6731 size = PyUnicode_GET_LENGTH(unicode);
6732 kind = PyUnicode_KIND(unicode);
6733 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734 /* allocate enough for a simple encoding without
6735 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006736 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006737 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006738 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006740 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006741 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 ressize = size;
6743
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006744 while (pos < size) {
6745 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 /* can we encode this? */
6748 if (c<limit) {
6749 /* no overflow check, because we know that the space is enough */
6750 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006752 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 Py_ssize_t requiredsize;
6755 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006758 Py_ssize_t collstart = pos;
6759 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 ++collend;
6763 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6764 if (known_errorHandler==-1) {
6765 if ((errors==NULL) || (!strcmp(errors, "strict")))
6766 known_errorHandler = 1;
6767 else if (!strcmp(errors, "replace"))
6768 known_errorHandler = 2;
6769 else if (!strcmp(errors, "ignore"))
6770 known_errorHandler = 3;
6771 else if (!strcmp(errors, "xmlcharrefreplace"))
6772 known_errorHandler = 4;
6773 else
6774 known_errorHandler = 0;
6775 }
6776 switch (known_errorHandler) {
6777 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006778 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 goto onError;
6780 case 2: /* replace */
6781 while (collstart++<collend)
6782 *str++ = '?'; /* fall through */
6783 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 break;
6786 case 4: /* xmlcharrefreplace */
6787 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 /* determine replacement size */
6789 for (i = collstart, repsize = 0; i < collend; ++i) {
6790 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6791 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006795 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006797 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006799 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006803 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006804 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006806 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 if (requiredsize > ressize) {
6810 if (requiredsize<2*ressize)
6811 requiredsize = 2*ressize;
6812 if (_PyBytes_Resize(&res, requiredsize))
6813 goto onError;
6814 str = PyBytes_AS_STRING(res) + respos;
6815 ressize = requiredsize;
6816 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 /* generate replacement */
6818 for (i = collstart; i < collend; ++i) {
6819 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 break;
6823 default:
6824 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006825 encoding, reason, unicode, &exc,
6826 collstart, collend, &newpos);
6827 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006828 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006830 if (PyBytes_Check(repunicode)) {
6831 /* Directly copy bytes result to output. */
6832 repsize = PyBytes_Size(repunicode);
6833 if (repsize > 1) {
6834 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006835 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006836 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6837 Py_DECREF(repunicode);
6838 goto onError;
6839 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006840 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006841 ressize += repsize-1;
6842 }
6843 memcpy(str, PyBytes_AsString(repunicode), repsize);
6844 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006846 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006847 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006848 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 /* need more space? (at least enough for what we
6850 have+the replacement+the rest of the string, so
6851 we won't have to check space for encodable characters) */
6852 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006853 repsize = PyUnicode_GET_LENGTH(repunicode);
6854 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 if (requiredsize > ressize) {
6856 if (requiredsize<2*ressize)
6857 requiredsize = 2*ressize;
6858 if (_PyBytes_Resize(&res, requiredsize)) {
6859 Py_DECREF(repunicode);
6860 goto onError;
6861 }
6862 str = PyBytes_AS_STRING(res) + respos;
6863 ressize = requiredsize;
6864 }
6865 /* check if there is anything unencodable in the replacement
6866 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006867 for (i = 0; repsize-->0; ++i, ++str) {
6868 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006870 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006871 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 Py_DECREF(repunicode);
6873 goto onError;
6874 }
6875 *str = (char)c;
6876 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006878 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006880 }
6881 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006882 /* Resize if we allocated to much */
6883 size = str - PyBytes_AS_STRING(res);
6884 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006885 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006886 if (_PyBytes_Resize(&res, size) < 0)
6887 goto onError;
6888 }
6889
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890 Py_XDECREF(errorHandler);
6891 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006892 return res;
6893
6894 onError:
6895 Py_XDECREF(res);
6896 Py_XDECREF(errorHandler);
6897 Py_XDECREF(exc);
6898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899}
6900
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902PyObject *
6903PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006904 Py_ssize_t size,
6905 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006907 PyObject *result;
6908 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6909 if (unicode == NULL)
6910 return NULL;
6911 result = unicode_encode_ucs1(unicode, errors, 256);
6912 Py_DECREF(unicode);
6913 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914}
6915
Alexander Belopolsky40018472011-02-26 01:02:56 +00006916PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006917_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918{
6919 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyErr_BadArgument();
6921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006923 if (PyUnicode_READY(unicode) == -1)
6924 return NULL;
6925 /* Fast path: if it is a one-byte string, construct
6926 bytes object directly. */
6927 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6928 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6929 PyUnicode_GET_LENGTH(unicode));
6930 /* Non-Latin-1 characters present. Defer to above function to
6931 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006932 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006933}
6934
6935PyObject*
6936PyUnicode_AsLatin1String(PyObject *unicode)
6937{
6938 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
6941/* --- 7-bit ASCII Codec -------------------------------------------------- */
6942
Alexander Belopolsky40018472011-02-26 01:02:56 +00006943PyObject *
6944PyUnicode_DecodeASCII(const char *s,
6945 Py_ssize_t size,
6946 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006948 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006949 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006950 int kind;
6951 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006952 Py_ssize_t startinpos;
6953 Py_ssize_t endinpos;
6954 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006956 int has_error;
6957 const unsigned char *p = (const unsigned char *)s;
6958 const unsigned char *end = p + size;
6959 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 PyObject *errorHandler = NULL;
6961 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006962
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006963 if (size == 0) {
6964 Py_INCREF(unicode_empty);
6965 return unicode_empty;
6966 }
6967
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006969 if (size == 1 && (unsigned char)s[0] < 128)
6970 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006971
Victor Stinner702c7342011-10-05 13:50:52 +02006972 has_error = 0;
6973 while (p < end && !has_error) {
6974 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6975 an explanation. */
6976 if (!((size_t) p & LONG_PTR_MASK)) {
6977 /* Help register allocation */
6978 register const unsigned char *_p = p;
6979 while (_p < aligned_end) {
6980 unsigned long value = *(unsigned long *) _p;
6981 if (value & ASCII_CHAR_MASK) {
6982 has_error = 1;
6983 break;
6984 }
6985 _p += SIZEOF_LONG;
6986 }
6987 if (_p == end)
6988 break;
6989 if (has_error)
6990 break;
6991 p = _p;
6992 }
6993 if (*p & 0x80) {
6994 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006995 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006996 }
6997 else {
6998 ++p;
6999 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007000 }
Victor Stinner702c7342011-10-05 13:50:52 +02007001 if (!has_error)
7002 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00007003
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007004 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007007 kind = PyUnicode_KIND(v);
7008 data = PyUnicode_DATA(v);
7009 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 e = s + size;
7011 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 register unsigned char c = (unsigned char)*s;
7013 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007014 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 ++s;
7016 }
7017 else {
7018 startinpos = s-starts;
7019 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 if (unicode_decode_call_errorhandler(
7021 errors, &errorHandler,
7022 "ascii", "ordinal not in range(128)",
7023 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007024 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007026 kind = PyUnicode_KIND(v);
7027 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007030 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007031 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007032 Py_XDECREF(errorHandler);
7033 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007034 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007035 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007036
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007039 Py_XDECREF(errorHandler);
7040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041 return NULL;
7042}
7043
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007044/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007045PyObject *
7046PyUnicode_EncodeASCII(const Py_UNICODE *p,
7047 Py_ssize_t size,
7048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007050 PyObject *result;
7051 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7052 if (unicode == NULL)
7053 return NULL;
7054 result = unicode_encode_ucs1(unicode, errors, 128);
7055 Py_DECREF(unicode);
7056 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057}
7058
Alexander Belopolsky40018472011-02-26 01:02:56 +00007059PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007060_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
7062 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 PyErr_BadArgument();
7064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007066 if (PyUnicode_READY(unicode) == -1)
7067 return NULL;
7068 /* Fast path: if it is an ASCII-only string, construct bytes object
7069 directly. Else defer to above function to raise the exception. */
7070 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7071 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7072 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007073 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007074}
7075
7076PyObject *
7077PyUnicode_AsASCIIString(PyObject *unicode)
7078{
7079 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
Victor Stinner99b95382011-07-04 14:23:54 +02007082#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007083
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007084/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007085
/* The Win32 conversion functions take `int` lengths.  When size_t is wider
   than int, inputs larger than INT_MAX have to be processed in several
   chunks; NEED_RETRY enables that chunking code below. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Provide the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7093
7094static char*
7095code_page_name(UINT code_page, PyObject **obj)
7096{
7097 *obj = NULL;
7098 if (code_page == CP_ACP)
7099 return "mbcs";
7100 if (code_page == CP_UTF7)
7101 return "CP_UTF7";
7102 if (code_page == CP_UTF8)
7103 return "CP_UTF8";
7104
7105 *obj = PyBytes_FromFormat("cp%u", code_page);
7106 if (*obj == NULL)
7107 return NULL;
7108 return PyBytes_AS_STRING(*obj);
7109}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110
Alexander Belopolsky40018472011-02-26 01:02:56 +00007111static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007112is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113{
7114 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 if (!IsDBCSLeadByteEx(code_page, *curr))
7118 return 0;
7119
7120 prev = CharPrevExA(code_page, s, curr, 0);
7121 if (prev == curr)
7122 return 1;
7123 /* FIXME: This code is limited to "true" double-byte encodings,
7124 as it assumes an incomplete character consists of a single
7125 byte. */
7126 if (curr - prev == 2)
7127 return 1;
7128 if (!IsDBCSLeadByteEx(code_page, *prev))
7129 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130 return 0;
7131}
7132
Victor Stinner3a50e702011-10-18 21:21:00 +02007133static DWORD
7134decode_code_page_flags(UINT code_page)
7135{
7136 if (code_page == CP_UTF7) {
7137 /* The CP_UTF7 decoder only supports flags=0 */
7138 return 0;
7139 }
7140 else
7141 return MB_ERR_INVALID_CHARS;
7142}
7143
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 * Decode a byte string from a Windows code page into unicode object in strict
7146 * mode.
7147 *
7148 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7149 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007151static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007152decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007153 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 const char *in,
7155 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156{
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007158 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160
7161 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 assert(insize > 0);
7163 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7164 if (outsize <= 0)
7165 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166
7167 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007169 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007170 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 if (*v == NULL)
7172 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007174 }
7175 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007178 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007181 }
7182
7183 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7185 if (outsize <= 0)
7186 goto error;
7187 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007188
Victor Stinner3a50e702011-10-18 21:21:00 +02007189error:
7190 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7191 return -2;
7192 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007193 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194}
7195
Victor Stinner3a50e702011-10-18 21:21:00 +02007196/*
7197 * Decode a byte string from a code page into unicode object with an error
7198 * handler.
7199 *
7200 * Returns consumed size if succeed, or raise a WindowsError or
7201 * UnicodeDecodeError exception and returns -1 on error.
7202 */
7203static int
7204decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007205 PyObject **v,
7206 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 const char *errors)
7208{
7209 const char *startin = in;
7210 const char *endin = in + size;
7211 const DWORD flags = decode_code_page_flags(code_page);
7212 /* Ideally, we should get reason from FormatMessage. This is the Windows
7213 2000 English version of the message. */
7214 const char *reason = "No mapping for the Unicode character exists "
7215 "in the target code page.";
7216 /* each step cannot decode more than 1 character, but a character can be
7217 represented as a surrogate pair */
7218 wchar_t buffer[2], *startout, *out;
7219 int insize, outsize;
7220 PyObject *errorHandler = NULL;
7221 PyObject *exc = NULL;
7222 PyObject *encoding_obj = NULL;
7223 char *encoding;
7224 DWORD err;
7225 int ret = -1;
7226
7227 assert(size > 0);
7228
7229 encoding = code_page_name(code_page, &encoding_obj);
7230 if (encoding == NULL)
7231 return -1;
7232
7233 if (errors == NULL || strcmp(errors, "strict") == 0) {
7234 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7235 UnicodeDecodeError. */
7236 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7237 if (exc != NULL) {
7238 PyCodec_StrictErrors(exc);
7239 Py_CLEAR(exc);
7240 }
7241 goto error;
7242 }
7243
7244 if (*v == NULL) {
7245 /* Create unicode object */
7246 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7247 PyErr_NoMemory();
7248 goto error;
7249 }
Victor Stinnerab595942011-12-17 04:59:06 +01007250 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007251 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 if (*v == NULL)
7253 goto error;
7254 startout = PyUnicode_AS_UNICODE(*v);
7255 }
7256 else {
7257 /* Extend unicode object */
7258 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7259 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7260 PyErr_NoMemory();
7261 goto error;
7262 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007263 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 goto error;
7265 startout = PyUnicode_AS_UNICODE(*v) + n;
7266 }
7267
7268 /* Decode the byte string character per character */
7269 out = startout;
7270 while (in < endin)
7271 {
7272 /* Decode a character */
7273 insize = 1;
7274 do
7275 {
7276 outsize = MultiByteToWideChar(code_page, flags,
7277 in, insize,
7278 buffer, Py_ARRAY_LENGTH(buffer));
7279 if (outsize > 0)
7280 break;
7281 err = GetLastError();
7282 if (err != ERROR_NO_UNICODE_TRANSLATION
7283 && err != ERROR_INSUFFICIENT_BUFFER)
7284 {
7285 PyErr_SetFromWindowsErr(0);
7286 goto error;
7287 }
7288 insize++;
7289 }
7290 /* 4=maximum length of a UTF-8 sequence */
7291 while (insize <= 4 && (in + insize) <= endin);
7292
7293 if (outsize <= 0) {
7294 Py_ssize_t startinpos, endinpos, outpos;
7295
7296 startinpos = in - startin;
7297 endinpos = startinpos + 1;
7298 outpos = out - PyUnicode_AS_UNICODE(*v);
7299 if (unicode_decode_call_errorhandler(
7300 errors, &errorHandler,
7301 encoding, reason,
7302 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007303 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 {
7305 goto error;
7306 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007307 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 }
7309 else {
7310 in += insize;
7311 memcpy(out, buffer, outsize * sizeof(wchar_t));
7312 out += outsize;
7313 }
7314 }
7315
7316 /* write a NUL character at the end */
7317 *out = 0;
7318
7319 /* Extend unicode object */
7320 outsize = out - startout;
7321 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007322 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007325
7326error:
7327 Py_XDECREF(encoding_obj);
7328 Py_XDECREF(errorHandler);
7329 Py_XDECREF(exc);
7330 return ret;
7331}
7332
Victor Stinner3a50e702011-10-18 21:21:00 +02007333static PyObject *
7334decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 const char *s, Py_ssize_t size,
7336 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337{
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 PyObject *v = NULL;
7339 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (code_page < 0) {
7342 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7343 return NULL;
7344 }
7345
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348
Victor Stinner76a31a62011-11-04 00:05:13 +01007349 do
7350 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007351#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007352 if (size > INT_MAX) {
7353 chunk_size = INT_MAX;
7354 final = 0;
7355 done = 0;
7356 }
7357 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007359 {
7360 chunk_size = (int)size;
7361 final = (consumed == NULL);
7362 done = 1;
7363 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364
Victor Stinner76a31a62011-11-04 00:05:13 +01007365 /* Skip trailing lead-byte unless 'final' is set */
7366 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7367 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368
Victor Stinner76a31a62011-11-04 00:05:13 +01007369 if (chunk_size == 0 && done) {
7370 if (v != NULL)
7371 break;
7372 Py_INCREF(unicode_empty);
7373 return unicode_empty;
7374 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375
Victor Stinner76a31a62011-11-04 00:05:13 +01007376
7377 converted = decode_code_page_strict(code_page, &v,
7378 s, chunk_size);
7379 if (converted == -2)
7380 converted = decode_code_page_errors(code_page, &v,
7381 s, chunk_size,
7382 errors);
7383 assert(converted != 0);
7384
7385 if (converted < 0) {
7386 Py_XDECREF(v);
7387 return NULL;
7388 }
7389
7390 if (consumed)
7391 *consumed += converted;
7392
7393 s += converted;
7394 size -= converted;
7395 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007396
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007397 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398}
7399
Alexander Belopolsky40018472011-02-26 01:02:56 +00007400PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007401PyUnicode_DecodeCodePageStateful(int code_page,
7402 const char *s,
7403 Py_ssize_t size,
7404 const char *errors,
7405 Py_ssize_t *consumed)
7406{
7407 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7408}
7409
7410PyObject *
7411PyUnicode_DecodeMBCSStateful(const char *s,
7412 Py_ssize_t size,
7413 const char *errors,
7414 Py_ssize_t *consumed)
7415{
7416 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7417}
7418
7419PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007420PyUnicode_DecodeMBCS(const char *s,
7421 Py_ssize_t size,
7422 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007423{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7425}
7426
Victor Stinner3a50e702011-10-18 21:21:00 +02007427static DWORD
7428encode_code_page_flags(UINT code_page, const char *errors)
7429{
7430 if (code_page == CP_UTF8) {
7431 if (winver.dwMajorVersion >= 6)
7432 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7433 and later */
7434 return WC_ERR_INVALID_CHARS;
7435 else
7436 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7437 return 0;
7438 }
7439 else if (code_page == CP_UTF7) {
7440 /* CP_UTF7 only supports flags=0 */
7441 return 0;
7442 }
7443 else {
7444 if (errors != NULL && strcmp(errors, "replace") == 0)
7445 return 0;
7446 else
7447 return WC_NO_BEST_FIT_CHARS;
7448 }
7449}
7450
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007451/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 * Encode a Unicode string to a Windows code page into a byte string in strict
7453 * mode.
7454 *
7455 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7456 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007457 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007458static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007459encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462{
Victor Stinner554f3f02010-06-16 23:33:54 +00007463 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 BOOL *pusedDefaultChar = &usedDefaultChar;
7465 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007466 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007467 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 const DWORD flags = encode_code_page_flags(code_page, NULL);
7470 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 /* Create a substring so that we can get the UTF-16 representation
7472 of just the slice under consideration. */
7473 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007476
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007478 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007480 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007481
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 substring = PyUnicode_Substring(unicode, offset, offset+len);
7483 if (substring == NULL)
7484 return -1;
7485 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7486 if (p == NULL) {
7487 Py_DECREF(substring);
7488 return -1;
7489 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007491 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 outsize = WideCharToMultiByte(code_page, flags,
7493 p, size,
7494 NULL, 0,
7495 NULL, pusedDefaultChar);
7496 if (outsize <= 0)
7497 goto error;
7498 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 if (pusedDefaultChar && *pusedDefaultChar) {
7500 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007507 if (*outbytes == NULL) {
7508 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007510 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007512 }
7513 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 const Py_ssize_t n = PyBytes_Size(*outbytes);
7516 if (outsize > PY_SSIZE_T_MAX - n) {
7517 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007518 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007521 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7522 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526 }
7527
7528 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 outsize = WideCharToMultiByte(code_page, flags,
7530 p, size,
7531 out, outsize,
7532 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007533 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 if (outsize <= 0)
7535 goto error;
7536 if (pusedDefaultChar && *pusedDefaultChar)
7537 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007538 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007539
Victor Stinner3a50e702011-10-18 21:21:00 +02007540error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7543 return -2;
7544 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007545 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007546}
7547
Victor Stinner3a50e702011-10-18 21:21:00 +02007548/*
7549 * Encode a Unicode string to a Windows code page into a byte string using a
7550 * error handler.
7551 *
7552 * Returns consumed characters if succeed, or raise a WindowsError and returns
7553 * -1 on other error.
7554 */
7555static int
7556encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007557 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007558 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007559{
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007561 Py_ssize_t pos = unicode_offset;
7562 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 /* Ideally, we should get reason from FormatMessage. This is the Windows
7564 2000 English version of the message. */
7565 const char *reason = "invalid character";
7566 /* 4=maximum length of a UTF-8 sequence */
7567 char buffer[4];
7568 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7569 Py_ssize_t outsize;
7570 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 PyObject *errorHandler = NULL;
7572 PyObject *exc = NULL;
7573 PyObject *encoding_obj = NULL;
7574 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007575 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 PyObject *rep;
7577 int ret = -1;
7578
7579 assert(insize > 0);
7580
7581 encoding = code_page_name(code_page, &encoding_obj);
7582 if (encoding == NULL)
7583 return -1;
7584
7585 if (errors == NULL || strcmp(errors, "strict") == 0) {
7586 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7587 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007588 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 if (exc != NULL) {
7590 PyCodec_StrictErrors(exc);
7591 Py_DECREF(exc);
7592 }
7593 Py_XDECREF(encoding_obj);
7594 return -1;
7595 }
7596
7597 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7598 pusedDefaultChar = &usedDefaultChar;
7599 else
7600 pusedDefaultChar = NULL;
7601
7602 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7603 PyErr_NoMemory();
7604 goto error;
7605 }
7606 outsize = insize * Py_ARRAY_LENGTH(buffer);
7607
7608 if (*outbytes == NULL) {
7609 /* Create string object */
7610 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7611 if (*outbytes == NULL)
7612 goto error;
7613 out = PyBytes_AS_STRING(*outbytes);
7614 }
7615 else {
7616 /* Extend string object */
7617 Py_ssize_t n = PyBytes_Size(*outbytes);
7618 if (n > PY_SSIZE_T_MAX - outsize) {
7619 PyErr_NoMemory();
7620 goto error;
7621 }
7622 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7623 goto error;
7624 out = PyBytes_AS_STRING(*outbytes) + n;
7625 }
7626
7627 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007628 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007630 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7631 wchar_t chars[2];
7632 int charsize;
7633 if (ch < 0x10000) {
7634 chars[0] = (wchar_t)ch;
7635 charsize = 1;
7636 }
7637 else {
7638 ch -= 0x10000;
7639 chars[0] = 0xd800 + (ch >> 10);
7640 chars[1] = 0xdc00 + (ch & 0x3ff);
7641 charsize = 2;
7642 }
7643
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007645 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 buffer, Py_ARRAY_LENGTH(buffer),
7647 NULL, pusedDefaultChar);
7648 if (outsize > 0) {
7649 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7650 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 memcpy(out, buffer, outsize);
7653 out += outsize;
7654 continue;
7655 }
7656 }
7657 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7658 PyErr_SetFromWindowsErr(0);
7659 goto error;
7660 }
7661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 rep = unicode_encode_call_errorhandler(
7663 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007664 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 if (rep == NULL)
7667 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007668 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007669
7670 if (PyBytes_Check(rep)) {
7671 outsize = PyBytes_GET_SIZE(rep);
7672 if (outsize != 1) {
7673 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7674 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7675 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7676 Py_DECREF(rep);
7677 goto error;
7678 }
7679 out = PyBytes_AS_STRING(*outbytes) + offset;
7680 }
7681 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7682 out += outsize;
7683 }
7684 else {
7685 Py_ssize_t i;
7686 enum PyUnicode_Kind kind;
7687 void *data;
7688
Benjamin Petersonbac79492012-01-14 13:34:47 -05007689 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 Py_DECREF(rep);
7691 goto error;
7692 }
7693
7694 outsize = PyUnicode_GET_LENGTH(rep);
7695 if (outsize != 1) {
7696 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7697 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7698 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7699 Py_DECREF(rep);
7700 goto error;
7701 }
7702 out = PyBytes_AS_STRING(*outbytes) + offset;
7703 }
7704 kind = PyUnicode_KIND(rep);
7705 data = PyUnicode_DATA(rep);
7706 for (i=0; i < outsize; i++) {
7707 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7708 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007709 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007710 encoding, unicode,
7711 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 "unable to encode error handler result to ASCII");
7713 Py_DECREF(rep);
7714 goto error;
7715 }
7716 *out = (unsigned char)ch;
7717 out++;
7718 }
7719 }
7720 Py_DECREF(rep);
7721 }
7722 /* write a NUL byte */
7723 *out = 0;
7724 outsize = out - PyBytes_AS_STRING(*outbytes);
7725 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7726 if (_PyBytes_Resize(outbytes, outsize) < 0)
7727 goto error;
7728 ret = 0;
7729
7730error:
7731 Py_XDECREF(encoding_obj);
7732 Py_XDECREF(errorHandler);
7733 Py_XDECREF(exc);
7734 return ret;
7735}
7736
Victor Stinner3a50e702011-10-18 21:21:00 +02007737static PyObject *
7738encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007740 const char *errors)
7741{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007743 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007746
Benjamin Petersonbac79492012-01-14 13:34:47 -05007747 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007748 return NULL;
7749 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007750
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 if (code_page < 0) {
7752 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7753 return NULL;
7754 }
7755
Martin v. Löwis3d325192011-11-04 18:23:06 +01007756 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007757 return PyBytes_FromStringAndSize(NULL, 0);
7758
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 offset = 0;
7760 do
7761 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007762#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007763 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007764 chunks. */
7765 if (len > INT_MAX/2) {
7766 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007767 done = 0;
7768 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007769 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007770#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 done = 1;
7774 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007775
Victor Stinner76a31a62011-11-04 00:05:13 +01007776 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007778 errors);
7779 if (ret == -2)
7780 ret = encode_code_page_errors(code_page, &outbytes,
7781 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007782 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 if (ret < 0) {
7784 Py_XDECREF(outbytes);
7785 return NULL;
7786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007787
Victor Stinner7581cef2011-11-03 22:32:33 +01007788 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007789 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007790 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007791
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 return outbytes;
7793}
7794
7795PyObject *
7796PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7797 Py_ssize_t size,
7798 const char *errors)
7799{
Victor Stinner7581cef2011-11-03 22:32:33 +01007800 PyObject *unicode, *res;
7801 unicode = PyUnicode_FromUnicode(p, size);
7802 if (unicode == NULL)
7803 return NULL;
7804 res = encode_code_page(CP_ACP, unicode, errors);
7805 Py_DECREF(unicode);
7806 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007807}
7808
7809PyObject *
7810PyUnicode_EncodeCodePage(int code_page,
7811 PyObject *unicode,
7812 const char *errors)
7813{
Victor Stinner7581cef2011-11-03 22:32:33 +01007814 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007815}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007816
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817PyObject *
7818PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007819{
7820 if (!PyUnicode_Check(unicode)) {
7821 PyErr_BadArgument();
7822 return NULL;
7823 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007824 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007825}
7826
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007827#undef NEED_RETRY
7828
Victor Stinner99b95382011-07-04 14:23:54 +02007829#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007830
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831/* --- Character Mapping Codec -------------------------------------------- */
7832
Alexander Belopolsky40018472011-02-26 01:02:56 +00007833PyObject *
7834PyUnicode_DecodeCharmap(const char *s,
7835 Py_ssize_t size,
7836 PyObject *mapping,
7837 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007840 Py_ssize_t startinpos;
7841 Py_ssize_t endinpos;
7842 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007844 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007845 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007846 PyObject *errorHandler = NULL;
7847 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007848
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 /* Default to Latin-1 */
7850 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007853 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007857 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007858 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007859 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007860 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007861 Py_ssize_t maplen;
7862 enum PyUnicode_Kind kind;
7863 void *data;
7864 Py_UCS4 x;
7865
Benjamin Petersonbac79492012-01-14 13:34:47 -05007866 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007867 return NULL;
7868
7869 maplen = PyUnicode_GET_LENGTH(mapping);
7870 data = PyUnicode_DATA(mapping);
7871 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 while (s < e) {
7873 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007876 x = PyUnicode_READ(kind, data, ch);
7877 else
7878 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007880 if (x == 0xfffe)
7881 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 startinpos = s-starts;
7884 endinpos = startinpos+1;
7885 if (unicode_decode_call_errorhandler(
7886 errors, &errorHandler,
7887 "charmap", "character maps to <undefined>",
7888 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007889 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 goto onError;
7891 }
7892 continue;
7893 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007894
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007895 if (unicode_putchar(&v, &outpos, x) < 0)
7896 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007898 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007899 }
7900 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 while (s < e) {
7902 unsigned char ch = *s;
7903 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007904
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7906 w = PyLong_FromLong((long)ch);
7907 if (w == NULL)
7908 goto onError;
7909 x = PyObject_GetItem(mapping, w);
7910 Py_DECREF(w);
7911 if (x == NULL) {
7912 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7913 /* No mapping found means: mapping is undefined. */
7914 PyErr_Clear();
7915 x = Py_None;
7916 Py_INCREF(x);
7917 } else
7918 goto onError;
7919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 /* Apply mapping */
7922 if (PyLong_Check(x)) {
7923 long value = PyLong_AS_LONG(x);
7924 if (value < 0 || value > 65535) {
7925 PyErr_SetString(PyExc_TypeError,
7926 "character mapping must be in range(65536)");
7927 Py_DECREF(x);
7928 goto onError;
7929 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007930 if (unicode_putchar(&v, &outpos, value) < 0)
7931 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 }
7933 else if (x == Py_None) {
7934 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 startinpos = s-starts;
7936 endinpos = startinpos+1;
7937 if (unicode_decode_call_errorhandler(
7938 errors, &errorHandler,
7939 "charmap", "character maps to <undefined>",
7940 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007941 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 Py_DECREF(x);
7943 goto onError;
7944 }
7945 Py_DECREF(x);
7946 continue;
7947 }
7948 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007949 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007950
Benjamin Petersonbac79492012-01-14 13:34:47 -05007951 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007952 goto onError;
7953 targetsize = PyUnicode_GET_LENGTH(x);
7954
7955 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007957 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007958 PyUnicode_READ_CHAR(x, 0)) < 0)
7959 goto onError;
7960 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 else if (targetsize > 1) {
7962 /* 1-n mapping */
7963 if (targetsize > extrachars) {
7964 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 Py_ssize_t needed = (targetsize - extrachars) + \
7966 (targetsize << 2);
7967 extrachars += needed;
7968 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007969 if (unicode_resize(&v,
7970 PyUnicode_GET_LENGTH(v) + needed) < 0)
7971 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 Py_DECREF(x);
7973 goto onError;
7974 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007976 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7977 goto onError;
7978 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7979 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 extrachars -= targetsize;
7981 }
7982 /* 1-0 mapping: skip the character */
7983 }
7984 else {
7985 /* wrong return value */
7986 PyErr_SetString(PyExc_TypeError,
7987 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007988 Py_DECREF(x);
7989 goto onError;
7990 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 Py_DECREF(x);
7992 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007995 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007996 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997 Py_XDECREF(errorHandler);
7998 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007999 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00008000
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002 Py_XDECREF(errorHandler);
8003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 Py_XDECREF(v);
8005 return NULL;
8006}
8007
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008/* Charmap encoding: the lookup table */
8009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 PyObject_HEAD
8012 unsigned char level1[32];
8013 int count2, count3;
8014 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008015};
8016
8017static PyObject*
8018encoding_map_size(PyObject *obj, PyObject* args)
8019{
8020 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008021 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023}
8024
8025static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008026 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 PyDoc_STR("Return the size (in bytes) of this object") },
8028 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029};
8030
/* Deallocator installed in the tp_dealloc slot of EncodingMapType.
   struct encoding_map holds only byte arrays and ints (no object
   references), so releasing the object's memory is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8036
8037static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 "EncodingMap", /*tp_name*/
8040 sizeof(struct encoding_map), /*tp_basicsize*/
8041 0, /*tp_itemsize*/
8042 /* methods */
8043 encoding_map_dealloc, /*tp_dealloc*/
8044 0, /*tp_print*/
8045 0, /*tp_getattr*/
8046 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008047 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 0, /*tp_repr*/
8049 0, /*tp_as_number*/
8050 0, /*tp_as_sequence*/
8051 0, /*tp_as_mapping*/
8052 0, /*tp_hash*/
8053 0, /*tp_call*/
8054 0, /*tp_str*/
8055 0, /*tp_getattro*/
8056 0, /*tp_setattro*/
8057 0, /*tp_as_buffer*/
8058 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8059 0, /*tp_doc*/
8060 0, /*tp_traverse*/
8061 0, /*tp_clear*/
8062 0, /*tp_richcompare*/
8063 0, /*tp_weaklistoffset*/
8064 0, /*tp_iter*/
8065 0, /*tp_iternext*/
8066 encoding_map_methods, /*tp_methods*/
8067 0, /*tp_members*/
8068 0, /*tp_getset*/
8069 0, /*tp_base*/
8070 0, /*tp_dict*/
8071 0, /*tp_descr_get*/
8072 0, /*tp_descr_set*/
8073 0, /*tp_dictoffset*/
8074 0, /*tp_init*/
8075 0, /*tp_alloc*/
8076 0, /*tp_new*/
8077 0, /*tp_free*/
8078 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079};
8080
8081PyObject*
8082PyUnicode_BuildEncodingMap(PyObject* string)
8083{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 PyObject *result;
8085 struct encoding_map *mresult;
8086 int i;
8087 int need_dict = 0;
8088 unsigned char level1[32];
8089 unsigned char level2[512];
8090 unsigned char *mlevel1, *mlevel2, *mlevel3;
8091 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 int kind;
8093 void *data;
8094 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 PyErr_BadArgument();
8098 return NULL;
8099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 kind = PyUnicode_KIND(string);
8101 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 memset(level1, 0xFF, sizeof level1);
8103 memset(level2, 0xFF, sizeof level2);
8104
8105 /* If there isn't a one-to-one mapping of NULL to \0,
8106 or if there are non-BMP characters, we need to use
8107 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 need_dict = 1;
8110 for (i = 1; i < 256; i++) {
8111 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 ch = PyUnicode_READ(kind, data, i);
8113 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 need_dict = 1;
8115 break;
8116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008117 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118 /* unmapped character */
8119 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 l1 = ch >> 11;
8121 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 if (level1[l1] == 0xFF)
8123 level1[l1] = count2++;
8124 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 }
8127
8128 if (count2 >= 0xFF || count3 >= 0xFF)
8129 need_dict = 1;
8130
8131 if (need_dict) {
8132 PyObject *result = PyDict_New();
8133 PyObject *key, *value;
8134 if (!result)
8135 return NULL;
8136 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008138 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008139 if (!key || !value)
8140 goto failed1;
8141 if (PyDict_SetItem(result, key, value) == -1)
8142 goto failed1;
8143 Py_DECREF(key);
8144 Py_DECREF(value);
8145 }
8146 return result;
8147 failed1:
8148 Py_XDECREF(key);
8149 Py_XDECREF(value);
8150 Py_DECREF(result);
8151 return NULL;
8152 }
8153
8154 /* Create a three-level trie */
8155 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8156 16*count2 + 128*count3 - 1);
8157 if (!result)
8158 return PyErr_NoMemory();
8159 PyObject_Init(result, &EncodingMapType);
8160 mresult = (struct encoding_map*)result;
8161 mresult->count2 = count2;
8162 mresult->count3 = count3;
8163 mlevel1 = mresult->level1;
8164 mlevel2 = mresult->level23;
8165 mlevel3 = mresult->level23 + 16*count2;
8166 memcpy(mlevel1, level1, 32);
8167 memset(mlevel2, 0xFF, 16*count2);
8168 memset(mlevel3, 0, 128*count3);
8169 count3 = 0;
8170 for (i = 1; i < 256; i++) {
8171 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008172 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173 /* unmapped character */
8174 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 o1 = PyUnicode_READ(kind, data, i)>>11;
8176 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 i2 = 16*mlevel1[o1] + o2;
8178 if (mlevel2[i2] == 0xFF)
8179 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181 i3 = 128*mlevel2[i2] + o3;
8182 mlevel3[i3] = i;
8183 }
8184 return result;
8185}
8186
8187static int
Victor Stinner22168992011-11-20 17:09:18 +01008188encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189{
8190 struct encoding_map *map = (struct encoding_map*)mapping;
8191 int l1 = c>>11;
8192 int l2 = (c>>7) & 0xF;
8193 int l3 = c & 0x7F;
8194 int i;
8195
Victor Stinner22168992011-11-20 17:09:18 +01008196 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008198 if (c == 0)
8199 return 0;
8200 /* level 1*/
8201 i = map->level1[l1];
8202 if (i == 0xFF) {
8203 return -1;
8204 }
8205 /* level 2*/
8206 i = map->level23[16*i+l2];
8207 if (i == 0xFF) {
8208 return -1;
8209 }
8210 /* level 3 */
8211 i = map->level23[16*map->count2 + 128*i + l3];
8212 if (i == 0) {
8213 return -1;
8214 }
8215 return i;
8216}
8217
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218/* Lookup the character ch in the mapping. If the character
8219 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008220 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008221static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008222charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223{
Christian Heimes217cfd12007-12-02 14:31:20 +00008224 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 PyObject *x;
8226
8227 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 x = PyObject_GetItem(mapping, w);
8230 Py_DECREF(w);
8231 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8233 /* No mapping found means: mapping is undefined. */
8234 PyErr_Clear();
8235 x = Py_None;
8236 Py_INCREF(x);
8237 return x;
8238 } else
8239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008241 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008243 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 long value = PyLong_AS_LONG(x);
8245 if (value < 0 || value > 255) {
8246 PyErr_SetString(PyExc_TypeError,
8247 "character mapping must be in range(256)");
8248 Py_DECREF(x);
8249 return NULL;
8250 }
8251 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008253 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 /* wrong return value */
8257 PyErr_Format(PyExc_TypeError,
8258 "character mapping must return integer, bytes or None, not %.400s",
8259 x->ob_type->tp_name);
8260 Py_DECREF(x);
8261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 }
8263}
8264
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008266charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8269 /* exponentially overallocate to minimize reallocations */
8270 if (requiredsize < 2*outsize)
8271 requiredsize = 2*outsize;
8272 if (_PyBytes_Resize(outobj, requiredsize))
8273 return -1;
8274 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008275}
8276
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008281 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 space is available. Return a new reference to the object that
8283 was put in the output buffer, or Py_None, if the mapping was undefined
8284 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008285 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008286static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008287charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008288 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 PyObject *rep;
8291 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008292 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293
Christian Heimes90aa7642007-12-19 02:45:37 +00008294 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297 if (res == -1)
8298 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 if (outsize<requiredsize)
8300 if (charmapencode_resize(outobj, outpos, requiredsize))
8301 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 outstart[(*outpos)++] = (char)res;
8304 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305 }
8306
8307 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 Py_DECREF(rep);
8312 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 if (PyLong_Check(rep)) {
8315 Py_ssize_t requiredsize = *outpos+1;
8316 if (outsize<requiredsize)
8317 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8318 Py_DECREF(rep);
8319 return enc_EXCEPTION;
8320 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008321 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008323 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 else {
8325 const char *repchars = PyBytes_AS_STRING(rep);
8326 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8327 Py_ssize_t requiredsize = *outpos+repsize;
8328 if (outsize<requiredsize)
8329 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8330 Py_DECREF(rep);
8331 return enc_EXCEPTION;
8332 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008333 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 memcpy(outstart + *outpos, repchars, repsize);
8335 *outpos += repsize;
8336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338 Py_DECREF(rep);
8339 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340}
8341
8342/* handle an error in PyUnicode_EncodeCharmap
8343 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008344static int
8345charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008346 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008348 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008349 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350{
8351 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008352 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008353 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008354 enum PyUnicode_Kind kind;
8355 void *data;
8356 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008358 Py_ssize_t collstartpos = *inpos;
8359 Py_ssize_t collendpos = *inpos+1;
8360 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 char *encoding = "charmap";
8362 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008365 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366
Benjamin Petersonbac79492012-01-14 13:34:47 -05008367 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 return -1;
8369 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 /* find all unencodable characters */
8371 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008373 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008374 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008375 val = encoding_map_lookup(ch, mapping);
8376 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 break;
8378 ++collendpos;
8379 continue;
8380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008381
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008382 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8383 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 if (rep==NULL)
8385 return -1;
8386 else if (rep!=Py_None) {
8387 Py_DECREF(rep);
8388 break;
8389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 }
8393 /* cache callback name lookup
8394 * (if not done yet, i.e. it's the first error) */
8395 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 if ((errors==NULL) || (!strcmp(errors, "strict")))
8397 *known_errorHandler = 1;
8398 else if (!strcmp(errors, "replace"))
8399 *known_errorHandler = 2;
8400 else if (!strcmp(errors, "ignore"))
8401 *known_errorHandler = 3;
8402 else if (!strcmp(errors, "xmlcharrefreplace"))
8403 *known_errorHandler = 4;
8404 else
8405 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 }
8407 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008409 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 return -1;
8411 case 2: /* replace */
8412 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 x = charmapencode_output('?', mapping, res, respos);
8414 if (x==enc_EXCEPTION) {
8415 return -1;
8416 }
8417 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008418 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 return -1;
8420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 }
8422 /* fall through */
8423 case 3: /* ignore */
8424 *inpos = collendpos;
8425 break;
8426 case 4: /* xmlcharrefreplace */
8427 /* generate replacement (temporarily (mis)uses p) */
8428 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 char buffer[2+29+1+1];
8430 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 for (cp = buffer; *cp; ++cp) {
8433 x = charmapencode_output(*cp, mapping, res, respos);
8434 if (x==enc_EXCEPTION)
8435 return -1;
8436 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008437 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return -1;
8439 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 }
8441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 *inpos = collendpos;
8443 break;
8444 default:
8445 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008450 if (PyBytes_Check(repunicode)) {
8451 /* Directly copy bytes result to output. */
8452 Py_ssize_t outsize = PyBytes_Size(*res);
8453 Py_ssize_t requiredsize;
8454 repsize = PyBytes_Size(repunicode);
8455 requiredsize = *respos + repsize;
8456 if (requiredsize > outsize)
8457 /* Make room for all additional bytes. */
8458 if (charmapencode_resize(res, respos, requiredsize)) {
8459 Py_DECREF(repunicode);
8460 return -1;
8461 }
8462 memcpy(PyBytes_AsString(*res) + *respos,
8463 PyBytes_AsString(repunicode), repsize);
8464 *respos += repsize;
8465 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008466 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008467 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008470 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008471 Py_DECREF(repunicode);
8472 return -1;
8473 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008474 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008475 data = PyUnicode_DATA(repunicode);
8476 kind = PyUnicode_KIND(repunicode);
8477 for (index = 0; index < repsize; index++) {
8478 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8479 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008481 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return -1;
8483 }
8484 else if (x==enc_FAILED) {
8485 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008486 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 return -1;
8488 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 }
8490 *inpos = newpos;
8491 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 }
8493 return 0;
8494}
8495
Alexander Belopolsky40018472011-02-26 01:02:56 +00008496PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497_PyUnicode_EncodeCharmap(PyObject *unicode,
8498 PyObject *mapping,
8499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 /* output object */
8502 PyObject *res = NULL;
8503 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008504 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008505 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008507 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508 PyObject *errorHandler = NULL;
8509 PyObject *exc = NULL;
8510 /* the following variable is used for caching string comparisons
8511 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8512 * 3=ignore, 4=xmlcharrefreplace */
8513 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514
Benjamin Petersonbac79492012-01-14 13:34:47 -05008515 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008516 return NULL;
8517 size = PyUnicode_GET_LENGTH(unicode);
8518
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 /* Default to Latin-1 */
8520 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008521 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 /* allocate enough for a simple encoding without
8524 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008525 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 if (res == NULL)
8527 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008528 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 if (x==enc_EXCEPTION) /* error */
8536 goto onError;
8537 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 &exc,
8540 &known_errorHandler, &errorHandler, errors,
8541 &res, &respos)) {
8542 goto onError;
8543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 else
8546 /* done with this character => adjust input position */
8547 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008551 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008552 if (_PyBytes_Resize(&res, respos) < 0)
8553 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 Py_XDECREF(exc);
8556 Py_XDECREF(errorHandler);
8557 return res;
8558
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 Py_XDECREF(res);
8561 Py_XDECREF(exc);
8562 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return NULL;
8564}
8565
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566/* Deprecated */
8567PyObject *
8568PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8569 Py_ssize_t size,
8570 PyObject *mapping,
8571 const char *errors)
8572{
8573 PyObject *result;
8574 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8575 if (unicode == NULL)
8576 return NULL;
8577 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8578 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008579 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582PyObject *
8583PyUnicode_AsCharmapString(PyObject *unicode,
8584 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585{
8586 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 PyErr_BadArgument();
8588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591}
8592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008594static void
8595make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008597 Py_ssize_t startpos, Py_ssize_t endpos,
8598 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 *exceptionObject = _PyUnicodeTranslateError_Create(
8602 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603 }
8604 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8606 goto onError;
8607 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8608 goto onError;
8609 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8610 goto onError;
8611 return;
8612 onError:
8613 Py_DECREF(*exceptionObject);
8614 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 }
8616}
8617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619static void
8620raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008622 Py_ssize_t startpos, Py_ssize_t endpos,
8623 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624{
8625 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629}
8630
8631/* error handling callback helper:
8632 build arguments, call the callback and check the arguments,
8633 put the result into newpos and return the replacement string, which
8634 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635static PyObject *
8636unicode_translate_call_errorhandler(const char *errors,
8637 PyObject **errorHandler,
8638 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008640 Py_ssize_t startpos, Py_ssize_t endpos,
8641 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008643 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008645 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 PyObject *restuple;
8647 PyObject *resunicode;
8648
8649 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 }
8654
8655 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659
8660 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008665 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 Py_DECREF(restuple);
8667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 &resunicode, &i_newpos)) {
8671 Py_DECREF(restuple);
8672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008674 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008676 else
8677 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8680 Py_DECREF(restuple);
8681 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008682 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 Py_INCREF(resunicode);
8684 Py_DECREF(restuple);
8685 return resunicode;
8686}
8687
8688/* Lookup the character ch in the mapping and put the result in result,
8689 which must be decrefed by the caller.
8690 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693{
Christian Heimes217cfd12007-12-02 14:31:20 +00008694 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 PyObject *x;
8696
8697 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 x = PyObject_GetItem(mapping, w);
8700 Py_DECREF(w);
8701 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8703 /* No mapping found means: use 1:1 mapping. */
8704 PyErr_Clear();
8705 *result = NULL;
8706 return 0;
8707 } else
8708 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 }
8710 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 *result = x;
8712 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008714 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 long value = PyLong_AS_LONG(x);
8716 long max = PyUnicode_GetMax();
8717 if (value < 0 || value > max) {
8718 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008719 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 Py_DECREF(x);
8721 return -1;
8722 }
8723 *result = x;
8724 return 0;
8725 }
8726 else if (PyUnicode_Check(x)) {
8727 *result = x;
8728 return 0;
8729 }
8730 else {
8731 /* wrong return value */
8732 PyErr_SetString(PyExc_TypeError,
8733 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008734 Py_DECREF(x);
8735 return -1;
8736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737}
8738/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 if not reallocate and adjust various state variables.
8740 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008746 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 /* exponentially overallocate to minimize reallocations */
8748 if (requiredsize < 2 * oldsize)
8749 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8751 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008754 }
8755 return 0;
8756}
8757/* lookup the character, put the result in the output string and adjust
8758 various state variables. Return a new reference to the object that
8759 was put in the output buffer in *result, or Py_None, if the mapping was
8760 undefined (in which case no character was written).
8761 The called must decref result.
8762 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8765 PyObject *mapping, Py_UCS4 **output,
8766 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008767 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8770 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 }
8776 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008778 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 }
8782 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 Py_ssize_t repsize;
8784 if (PyUnicode_READY(*res) == -1)
8785 return -1;
8786 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 if (repsize==1) {
8788 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 }
8791 else if (repsize!=0) {
8792 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 Py_ssize_t requiredsize = *opos +
8794 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 Py_ssize_t i;
8797 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 for(i = 0; i < repsize; i++)
8800 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 }
8803 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 return 0;
8806}
8807
Alexander Belopolsky40018472011-02-26 01:02:56 +00008808PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809_PyUnicode_TranslateCharmap(PyObject *input,
8810 PyObject *mapping,
8811 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 /* input object */
8814 char *idata;
8815 Py_ssize_t size, i;
8816 int kind;
8817 /* output buffer */
8818 Py_UCS4 *output = NULL;
8819 Py_ssize_t osize;
8820 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823 char *reason = "character maps to <undefined>";
8824 PyObject *errorHandler = NULL;
8825 PyObject *exc = NULL;
8826 /* the following variable is used for caching string comparisons
8827 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8828 * 3=ignore, 4=xmlcharrefreplace */
8829 int known_errorHandler = -1;
8830
Guido van Rossumd57fd912000-03-10 22:53:23 +00008831 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 PyErr_BadArgument();
8833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 if (PyUnicode_READY(input) == -1)
8837 return NULL;
8838 idata = (char*)PyUnicode_DATA(input);
8839 kind = PyUnicode_KIND(input);
8840 size = PyUnicode_GET_LENGTH(input);
8841 i = 0;
8842
8843 if (size == 0) {
8844 Py_INCREF(input);
8845 return input;
8846 }
8847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008848 /* allocate enough for a simple 1:1 translation without
8849 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 osize = size;
8851 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8852 opos = 0;
8853 if (output == NULL) {
8854 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008859 /* try to encode it */
8860 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 if (charmaptranslate_output(input, i, mapping,
8862 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 Py_XDECREF(x);
8864 goto onError;
8865 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008866 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 else { /* untranslatable character */
8870 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8871 Py_ssize_t repsize;
8872 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 Py_ssize_t collstart = i;
8876 Py_ssize_t collend = i+1;
8877 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 while (collend < size) {
8881 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 goto onError;
8883 Py_XDECREF(x);
8884 if (x!=Py_None)
8885 break;
8886 ++collend;
8887 }
8888 /* cache callback name lookup
8889 * (if not done yet, i.e. it's the first error) */
8890 if (known_errorHandler==-1) {
8891 if ((errors==NULL) || (!strcmp(errors, "strict")))
8892 known_errorHandler = 1;
8893 else if (!strcmp(errors, "replace"))
8894 known_errorHandler = 2;
8895 else if (!strcmp(errors, "ignore"))
8896 known_errorHandler = 3;
8897 else if (!strcmp(errors, "xmlcharrefreplace"))
8898 known_errorHandler = 4;
8899 else
8900 known_errorHandler = 0;
8901 }
8902 switch (known_errorHandler) {
8903 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 raise_translate_exception(&exc, input, collstart,
8905 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008906 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 case 2: /* replace */
8908 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 for (coll = collstart; coll<collend; coll++)
8910 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 /* fall through */
8912 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 break;
8915 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 /* generate replacement (temporarily (mis)uses i) */
8917 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 char buffer[2+29+1+1];
8919 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8921 if (charmaptranslate_makespace(&output, &osize,
8922 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 goto onError;
8924 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 break;
8929 default:
8930 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 reason, input, &exc,
8932 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008933 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008935 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008936 Py_DECREF(repunicode);
8937 goto onError;
8938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 repsize = PyUnicode_GET_LENGTH(repunicode);
8941 if (charmaptranslate_makespace(&output, &osize,
8942 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 Py_DECREF(repunicode);
8944 goto onError;
8945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 for (uni2 = 0; repsize-->0; ++uni2)
8947 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8948 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008950 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008951 }
8952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8954 if (!res)
8955 goto onError;
8956 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008957 Py_XDECREF(exc);
8958 Py_XDECREF(errorHandler);
8959 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008963 Py_XDECREF(exc);
8964 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 return NULL;
8966}
8967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968/* Deprecated. Use PyUnicode_Translate instead. */
8969PyObject *
8970PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8971 Py_ssize_t size,
8972 PyObject *mapping,
8973 const char *errors)
8974{
8975 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8976 if (!unicode)
8977 return NULL;
8978 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8979}
8980
Alexander Belopolsky40018472011-02-26 01:02:56 +00008981PyObject *
8982PyUnicode_Translate(PyObject *str,
8983 PyObject *mapping,
8984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985{
8986 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008987
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 str = PyUnicode_FromObject(str);
8989 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992 Py_DECREF(str);
8993 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008994
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996 Py_XDECREF(str);
8997 return NULL;
8998}
Tim Petersced69f82003-09-16 20:30:58 +00008999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009001fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002{
9003 /* No need to call PyUnicode_READY(self) because this function is only
9004 called as a callback from fixup() which does it already. */
9005 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9006 const int kind = PyUnicode_KIND(self);
9007 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009008 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009009 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 Py_ssize_t i;
9011
9012 for (i = 0; i < len; ++i) {
9013 ch = PyUnicode_READ(kind, data, i);
9014 fixed = 0;
9015 if (ch > 127) {
9016 if (Py_UNICODE_ISSPACE(ch))
9017 fixed = ' ';
9018 else {
9019 const int decimal = Py_UNICODE_TODECIMAL(ch);
9020 if (decimal >= 0)
9021 fixed = '0' + decimal;
9022 }
9023 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009024 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02009025 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 PyUnicode_WRITE(kind, data, i, fixed);
9027 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009028 else
9029 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 }
9032
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009033 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034}
9035
9036PyObject *
9037_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9038{
9039 if (!PyUnicode_Check(unicode)) {
9040 PyErr_BadInternalCall();
9041 return NULL;
9042 }
9043 if (PyUnicode_READY(unicode) == -1)
9044 return NULL;
9045 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9046 /* If the string is already ASCII, just return the same string */
9047 Py_INCREF(unicode);
9048 return unicode;
9049 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009050 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051}
9052
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009053PyObject *
9054PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9055 Py_ssize_t length)
9056{
Victor Stinnerf0124502011-11-21 23:12:56 +01009057 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009058 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009059 Py_UCS4 maxchar;
9060 enum PyUnicode_Kind kind;
9061 void *data;
9062
Victor Stinner99d7ad02012-02-22 13:37:39 +01009063 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009064 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009065 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009066 if (ch > 127) {
9067 int decimal = Py_UNICODE_TODECIMAL(ch);
9068 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009069 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02009070 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009071 }
9072 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009073
9074 /* Copy to a new string */
9075 decimal = PyUnicode_New(length, maxchar);
9076 if (decimal == NULL)
9077 return decimal;
9078 kind = PyUnicode_KIND(decimal);
9079 data = PyUnicode_DATA(decimal);
9080 /* Iterate over code points */
9081 for (i = 0; i < length; i++) {
9082 Py_UNICODE ch = s[i];
9083 if (ch > 127) {
9084 int decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0)
9086 ch = '0' + decimal;
9087 }
9088 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009090 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009091}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009092/* --- Decimal Encoder ---------------------------------------------------- */
9093
Alexander Belopolsky40018472011-02-26 01:02:56 +00009094int
9095PyUnicode_EncodeDecimal(Py_UNICODE *s,
9096 Py_ssize_t length,
9097 char *output,
9098 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009099{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009100 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009101 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009102 enum PyUnicode_Kind kind;
9103 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009104
9105 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 PyErr_BadArgument();
9107 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009108 }
9109
Victor Stinner42bf7752011-11-21 22:52:58 +01009110 unicode = PyUnicode_FromUnicode(s, length);
9111 if (unicode == NULL)
9112 return -1;
9113
Benjamin Petersonbac79492012-01-14 13:34:47 -05009114 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009115 Py_DECREF(unicode);
9116 return -1;
9117 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009118 kind = PyUnicode_KIND(unicode);
9119 data = PyUnicode_DATA(unicode);
9120
Victor Stinnerb84d7232011-11-22 01:50:07 +01009121 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009122 PyObject *exc;
9123 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009125 Py_ssize_t startpos;
9126
9127 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009128
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009130 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009131 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 decimal = Py_UNICODE_TODECIMAL(ch);
9135 if (decimal >= 0) {
9136 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009137 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 continue;
9139 }
9140 if (0 < ch && ch < 256) {
9141 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009142 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 continue;
9144 }
Victor Stinner6345be92011-11-25 20:09:01 +01009145
Victor Stinner42bf7752011-11-21 22:52:58 +01009146 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009147 exc = NULL;
9148 raise_encode_exception(&exc, "decimal", unicode,
9149 startpos, startpos+1,
9150 "invalid decimal Unicode string");
9151 Py_XDECREF(exc);
9152 Py_DECREF(unicode);
9153 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009154 }
9155 /* 0-terminate the output string */
9156 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009157 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009158 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009159}
9160
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161/* --- Helpers ------------------------------------------------------------ */
9162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009164any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 Py_ssize_t start,
9166 Py_ssize_t end)
9167{
9168 int kind1, kind2, kind;
9169 void *buf1, *buf2;
9170 Py_ssize_t len1, len2, result;
9171
9172 kind1 = PyUnicode_KIND(s1);
9173 kind2 = PyUnicode_KIND(s2);
9174 kind = kind1 > kind2 ? kind1 : kind2;
9175 buf1 = PyUnicode_DATA(s1);
9176 buf2 = PyUnicode_DATA(s2);
9177 if (kind1 != kind)
9178 buf1 = _PyUnicode_AsKind(s1, kind);
9179 if (!buf1)
9180 return -2;
9181 if (kind2 != kind)
9182 buf2 = _PyUnicode_AsKind(s2, kind);
9183 if (!buf2) {
9184 if (kind1 != kind) PyMem_Free(buf1);
9185 return -2;
9186 }
9187 len1 = PyUnicode_GET_LENGTH(s1);
9188 len2 = PyUnicode_GET_LENGTH(s2);
9189
Victor Stinner794d5672011-10-10 03:21:36 +02009190 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009191 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009192 case PyUnicode_1BYTE_KIND:
9193 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9194 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9195 else
9196 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9197 break;
9198 case PyUnicode_2BYTE_KIND:
9199 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9200 break;
9201 case PyUnicode_4BYTE_KIND:
9202 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9203 break;
9204 default:
9205 assert(0); result = -2;
9206 }
9207 }
9208 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009209 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009210 case PyUnicode_1BYTE_KIND:
9211 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9212 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9213 else
9214 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9215 break;
9216 case PyUnicode_2BYTE_KIND:
9217 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9218 break;
9219 case PyUnicode_4BYTE_KIND:
9220 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9221 break;
9222 default:
9223 assert(0); result = -2;
9224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 }
9226
9227 if (kind1 != kind)
9228 PyMem_Free(buf1);
9229 if (kind2 != kind)
9230 PyMem_Free(buf2);
9231
9232 return result;
9233}
9234
9235Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009236_PyUnicode_InsertThousandsGrouping(
9237 PyObject *unicode, Py_ssize_t index,
9238 Py_ssize_t n_buffer,
9239 void *digits, Py_ssize_t n_digits,
9240 Py_ssize_t min_width,
9241 const char *grouping, PyObject *thousands_sep,
9242 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243{
Victor Stinner41a863c2012-02-24 00:37:51 +01009244 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009245 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009246 Py_ssize_t thousands_sep_len;
9247 Py_ssize_t len;
9248
9249 if (unicode != NULL) {
9250 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009251 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009252 }
9253 else {
9254 kind = PyUnicode_1BYTE_KIND;
9255 data = NULL;
9256 }
9257 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9258 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9259 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9260 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009261 if (thousands_sep_kind < kind) {
9262 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9263 if (!thousands_sep_data)
9264 return -1;
9265 }
9266 else {
9267 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9268 if (!data)
9269 return -1;
9270 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009271 }
9272
Benjamin Petersonead6b532011-12-20 17:23:42 -06009273 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009275 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009276 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009277 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009278 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009279 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009280 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009281 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009282 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009283 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009284 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009285 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009287 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009288 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009289 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009290 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009291 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009293 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009294 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009295 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009296 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009297 break;
9298 default:
9299 assert(0);
9300 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009302 if (unicode != NULL && thousands_sep_kind != kind) {
9303 if (thousands_sep_kind < kind)
9304 PyMem_Free(thousands_sep_data);
9305 else
9306 PyMem_Free(data);
9307 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 if (unicode == NULL) {
9309 *maxchar = 127;
9310 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009311 *maxchar = MAX_MAXCHAR(*maxchar,
9312 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009313 }
9314 }
9315 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316}
9317
9318
/* Normalise a (start, end) slice pair against a sequence of length len:
   end is capped at len, negative values are taken relative to len, and
   anything still negative becomes 0.  start is not capped at len.

   The macro modifies start and end in place and evaluates its arguments
   several times.  It expands to two bare `if` statements (there is no
   do/while wrapper), so only use it as a complete statement inside a
   braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009333
Alexander Belopolsky40018472011-02-26 01:02:56 +00009334Py_ssize_t
9335PyUnicode_Count(PyObject *str,
9336 PyObject *substr,
9337 Py_ssize_t start,
9338 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009340 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009341 PyObject* str_obj;
9342 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 int kind1, kind2, kind;
9344 void *buf1 = NULL, *buf2 = NULL;
9345 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009346
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009347 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009348 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009350 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009351 if (!sub_obj) {
9352 Py_DECREF(str_obj);
9353 return -1;
9354 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009355 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009356 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 Py_DECREF(str_obj);
9358 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 }
Tim Petersced69f82003-09-16 20:30:58 +00009360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 kind1 = PyUnicode_KIND(str_obj);
9362 kind2 = PyUnicode_KIND(sub_obj);
9363 kind = kind1 > kind2 ? kind1 : kind2;
9364 buf1 = PyUnicode_DATA(str_obj);
9365 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009366 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 if (!buf1)
9368 goto onError;
9369 buf2 = PyUnicode_DATA(sub_obj);
9370 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009371 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 if (!buf2)
9373 goto onError;
9374 len1 = PyUnicode_GET_LENGTH(str_obj);
9375 len2 = PyUnicode_GET_LENGTH(sub_obj);
9376
9377 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009378 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009380 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9381 result = asciilib_count(
9382 ((Py_UCS1*)buf1) + start, end - start,
9383 buf2, len2, PY_SSIZE_T_MAX
9384 );
9385 else
9386 result = ucs1lib_count(
9387 ((Py_UCS1*)buf1) + start, end - start,
9388 buf2, len2, PY_SSIZE_T_MAX
9389 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 break;
9391 case PyUnicode_2BYTE_KIND:
9392 result = ucs2lib_count(
9393 ((Py_UCS2*)buf1) + start, end - start,
9394 buf2, len2, PY_SSIZE_T_MAX
9395 );
9396 break;
9397 case PyUnicode_4BYTE_KIND:
9398 result = ucs4lib_count(
9399 ((Py_UCS4*)buf1) + start, end - start,
9400 buf2, len2, PY_SSIZE_T_MAX
9401 );
9402 break;
9403 default:
9404 assert(0); result = 0;
9405 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009406
9407 Py_DECREF(sub_obj);
9408 Py_DECREF(str_obj);
9409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 if (kind1 != kind)
9411 PyMem_Free(buf1);
9412 if (kind2 != kind)
9413 PyMem_Free(buf2);
9414
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416 onError:
9417 Py_DECREF(sub_obj);
9418 Py_DECREF(str_obj);
9419 if (kind1 != kind && buf1)
9420 PyMem_Free(buf1);
9421 if (kind2 != kind && buf2)
9422 PyMem_Free(buf2);
9423 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424}
9425
Alexander Belopolsky40018472011-02-26 01:02:56 +00009426Py_ssize_t
9427PyUnicode_Find(PyObject *str,
9428 PyObject *sub,
9429 Py_ssize_t start,
9430 Py_ssize_t end,
9431 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009433 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009434
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009436 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009438 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009439 if (!sub) {
9440 Py_DECREF(str);
9441 return -2;
9442 }
9443 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9444 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 Py_DECREF(str);
9446 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447 }
Tim Petersced69f82003-09-16 20:30:58 +00009448
Victor Stinner794d5672011-10-10 03:21:36 +02009449 result = any_find_slice(direction,
9450 str, sub, start, end
9451 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009452
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009454 Py_DECREF(sub);
9455
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456 return result;
9457}
9458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459Py_ssize_t
9460PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9461 Py_ssize_t start, Py_ssize_t end,
9462 int direction)
9463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009465 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 if (PyUnicode_READY(str) == -1)
9467 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009468 if (start < 0 || end < 0) {
9469 PyErr_SetString(PyExc_IndexError, "string index out of range");
9470 return -2;
9471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 if (end > PyUnicode_GET_LENGTH(str))
9473 end = PyUnicode_GET_LENGTH(str);
9474 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009475 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9476 kind, end-start, ch, direction);
9477 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009479 else
9480 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481}
9482
Alexander Belopolsky40018472011-02-26 01:02:56 +00009483static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009484tailmatch(PyObject *self,
9485 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009486 Py_ssize_t start,
9487 Py_ssize_t end,
9488 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 int kind_self;
9491 int kind_sub;
9492 void *data_self;
9493 void *data_sub;
9494 Py_ssize_t offset;
9495 Py_ssize_t i;
9496 Py_ssize_t end_sub;
9497
9498 if (PyUnicode_READY(self) == -1 ||
9499 PyUnicode_READY(substring) == -1)
9500 return 0;
9501
9502 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 return 1;
9504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9506 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 kind_self = PyUnicode_KIND(self);
9511 data_self = PyUnicode_DATA(self);
9512 kind_sub = PyUnicode_KIND(substring);
9513 data_sub = PyUnicode_DATA(substring);
9514 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9515
9516 if (direction > 0)
9517 offset = end;
9518 else
9519 offset = start;
9520
9521 if (PyUnicode_READ(kind_self, data_self, offset) ==
9522 PyUnicode_READ(kind_sub, data_sub, 0) &&
9523 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9524 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9525 /* If both are of the same kind, memcmp is sufficient */
9526 if (kind_self == kind_sub) {
9527 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009528 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 data_sub,
9530 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009531 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 }
9533 /* otherwise we have to compare each character by first accesing it */
9534 else {
9535 /* We do not need to compare 0 and len(substring)-1 because
9536 the if statement above ensured already that they are equal
9537 when we end up here. */
9538 // TODO: honor direction and do a forward or backwards search
9539 for (i = 1; i < end_sub; ++i) {
9540 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9541 PyUnicode_READ(kind_sub, data_sub, i))
9542 return 0;
9543 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 }
9547
9548 return 0;
9549}
9550
Alexander Belopolsky40018472011-02-26 01:02:56 +00009551Py_ssize_t
9552PyUnicode_Tailmatch(PyObject *str,
9553 PyObject *substr,
9554 Py_ssize_t start,
9555 Py_ssize_t end,
9556 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009558 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009559
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 str = PyUnicode_FromObject(str);
9561 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563 substr = PyUnicode_FromObject(substr);
9564 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009565 Py_DECREF(str);
9566 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567 }
Tim Petersced69f82003-09-16 20:30:58 +00009568
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009569 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 Py_DECREF(str);
9572 Py_DECREF(substr);
9573 return result;
9574}
9575
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576/* Apply fixfct filter to the Unicode object self and return a
9577 reference to the modified object */
9578
Alexander Belopolsky40018472011-02-26 01:02:56 +00009579static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009580fixup(PyObject *self,
9581 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 PyObject *u;
9584 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009585 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009587 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009590 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 /* fix functions return the new maximum character in a string,
9593 if the kind of the resulting unicode object does not change,
9594 everything is fine. Otherwise we need to change the string kind
9595 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009596 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009597
9598 if (maxchar_new == 0) {
9599 /* no changes */;
9600 if (PyUnicode_CheckExact(self)) {
9601 Py_DECREF(u);
9602 Py_INCREF(self);
9603 return self;
9604 }
9605 else
9606 return u;
9607 }
9608
Victor Stinnere6abb482012-05-02 01:15:40 +02009609 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610
Victor Stinnereaab6042011-12-11 22:22:39 +01009611 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009613
9614 /* In case the maximum character changed, we need to
9615 convert the string to the new category. */
9616 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9617 if (v == NULL) {
9618 Py_DECREF(u);
9619 return NULL;
9620 }
9621 if (maxchar_new > maxchar_old) {
9622 /* If the maxchar increased so that the kind changed, not all
9623 characters are representable anymore and we need to fix the
9624 string again. This only happens in very few cases. */
9625 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9626 maxchar_old = fixfct(v);
9627 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 }
9629 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009630 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009632 Py_DECREF(u);
9633 assert(_PyUnicode_CheckConsistency(v, 1));
9634 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635}
9636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637static PyObject *
9638ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009640 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9641 char *resdata, *data = PyUnicode_DATA(self);
9642 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009643
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009644 res = PyUnicode_New(len, 127);
9645 if (res == NULL)
9646 return NULL;
9647 resdata = PyUnicode_DATA(res);
9648 if (lower)
9649 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 _Py_bytes_upper(resdata, data, len);
9652 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653}
9654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 Py_ssize_t j;
9659 int final_sigma;
9660 Py_UCS4 c;
9661 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009662
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9664
9665 where ! is a negation and \p{xxx} is a character with property xxx.
9666 */
9667 for (j = i - 1; j >= 0; j--) {
9668 c = PyUnicode_READ(kind, data, j);
9669 if (!_PyUnicode_IsCaseIgnorable(c))
9670 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009672 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9673 if (final_sigma) {
9674 for (j = i + 1; j < length; j++) {
9675 c = PyUnicode_READ(kind, data, j);
9676 if (!_PyUnicode_IsCaseIgnorable(c))
9677 break;
9678 }
9679 final_sigma = j == length || !_PyUnicode_IsCased(c);
9680 }
9681 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682}
9683
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009684static int
9685lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9686 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 /* Obscure special case. */
9689 if (c == 0x3A3) {
9690 mapped[0] = handle_capital_sigma(kind, data, length, i);
9691 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694}
9695
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696static Py_ssize_t
9697do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 Py_ssize_t i, k = 0;
9700 int n_res, j;
9701 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009702
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 c = PyUnicode_READ(kind, data, 0);
9704 n_res = _PyUnicode_ToUpperFull(c, mapped);
9705 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009706 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 for (i = 1; i < length; i++) {
9710 c = PyUnicode_READ(kind, data, i);
9711 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9712 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009713 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009715 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009716 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
9719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720static Py_ssize_t
9721do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9722 Py_ssize_t i, k = 0;
9723
9724 for (i = 0; i < length; i++) {
9725 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9726 int n_res, j;
9727 if (Py_UNICODE_ISUPPER(c)) {
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 }
9730 else if (Py_UNICODE_ISLOWER(c)) {
9731 n_res = _PyUnicode_ToUpperFull(c, mapped);
9732 }
9733 else {
9734 n_res = 1;
9735 mapped[0] = c;
9736 }
9737 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009738 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 res[k++] = mapped[j];
9740 }
9741 }
9742 return k;
9743}
9744
9745static Py_ssize_t
9746do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9747 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 Py_ssize_t i, k = 0;
9750
9751 for (i = 0; i < length; i++) {
9752 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9753 int n_res, j;
9754 if (lower)
9755 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9756 else
9757 n_res = _PyUnicode_ToUpperFull(c, mapped);
9758 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009759 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
9761 }
9762 }
9763 return k;
9764}
9765
9766static Py_ssize_t
9767do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9768{
9769 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9770}
9771
9772static Py_ssize_t
9773do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9774{
9775 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9776}
9777
Benjamin Petersone51757f2012-01-12 21:10:29 -05009778static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009779do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780{
9781 Py_ssize_t i, k = 0;
9782
9783 for (i = 0; i < length; i++) {
9784 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9785 Py_UCS4 mapped[3];
9786 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9787 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009788 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009789 res[k++] = mapped[j];
9790 }
9791 }
9792 return k;
9793}
9794
9795static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009796do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799 int previous_is_cased;
9800
9801 previous_is_cased = 0;
9802 for (i = 0; i < length; i++) {
9803 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9804 Py_UCS4 mapped[3];
9805 int n_res, j;
9806
9807 if (previous_is_cased)
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 else
9810 n_res = _PyUnicode_ToTitleFull(c, mapped);
9811
9812 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009813 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009814 res[k++] = mapped[j];
9815 }
9816
9817 previous_is_cased = _PyUnicode_IsCased(c);
9818 }
9819 return k;
9820}
9821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822static PyObject *
9823case_operation(PyObject *self,
9824 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9825{
9826 PyObject *res = NULL;
9827 Py_ssize_t length, newlength = 0;
9828 int kind, outkind;
9829 void *data, *outdata;
9830 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9831
Benjamin Petersoneea48462012-01-16 14:28:50 -05009832 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833
9834 kind = PyUnicode_KIND(self);
9835 data = PyUnicode_DATA(self);
9836 length = PyUnicode_GET_LENGTH(self);
9837 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9838 if (tmp == NULL)
9839 return PyErr_NoMemory();
9840 newlength = perform(kind, data, length, tmp, &maxchar);
9841 res = PyUnicode_New(newlength, maxchar);
9842 if (res == NULL)
9843 goto leave;
9844 tmpend = tmp + newlength;
9845 outdata = PyUnicode_DATA(res);
9846 outkind = PyUnicode_KIND(res);
9847 switch (outkind) {
9848 case PyUnicode_1BYTE_KIND:
9849 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9850 break;
9851 case PyUnicode_2BYTE_KIND:
9852 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9853 break;
9854 case PyUnicode_4BYTE_KIND:
9855 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9856 break;
9857 default:
9858 assert(0);
9859 break;
9860 }
9861 leave:
9862 PyMem_FREE(tmp);
9863 return res;
9864}
9865
Tim Peters8ce9f162004-08-27 01:49:32 +00009866PyObject *
9867PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009870 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009873 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9874 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009875 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009877 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009879 int use_memcpy;
9880 unsigned char *res_data = NULL, *sep_data = NULL;
9881 PyObject *last_obj;
9882 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Tim Peters05eba1f2004-08-27 21:32:02 +00009884 fseq = PySequence_Fast(seq, "");
9885 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009886 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009887 }
9888
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009889 /* NOTE: the following code can't call back into Python code,
9890 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009891 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009892
Tim Peters05eba1f2004-08-27 21:32:02 +00009893 seqlen = PySequence_Fast_GET_SIZE(fseq);
9894 /* If empty sequence, return u"". */
9895 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009896 Py_DECREF(fseq);
9897 Py_INCREF(unicode_empty);
9898 res = unicode_empty;
9899 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009900 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009901
Tim Peters05eba1f2004-08-27 21:32:02 +00009902 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009903 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009904 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009905 if (seqlen == 1) {
9906 if (PyUnicode_CheckExact(items[0])) {
9907 res = items[0];
9908 Py_INCREF(res);
9909 Py_DECREF(fseq);
9910 return res;
9911 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009912 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009913 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009914 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009916 /* Set up sep and seplen */
9917 if (separator == NULL) {
9918 /* fall back to a blank space separator */
9919 sep = PyUnicode_FromOrdinal(' ');
9920 if (!sep)
9921 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009922 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009923 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009924 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009925 else {
9926 if (!PyUnicode_Check(separator)) {
9927 PyErr_Format(PyExc_TypeError,
9928 "separator: expected str instance,"
9929 " %.80s found",
9930 Py_TYPE(separator)->tp_name);
9931 goto onError;
9932 }
9933 if (PyUnicode_READY(separator))
9934 goto onError;
9935 sep = separator;
9936 seplen = PyUnicode_GET_LENGTH(separator);
9937 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9938 /* inc refcount to keep this code path symmetric with the
9939 above case of a blank separator */
9940 Py_INCREF(sep);
9941 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009942 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009943 }
9944
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009945 /* There are at least two things to join, or else we have a subclass
9946 * of str in the sequence.
9947 * Do a pre-pass to figure out the total amount of space we'll
9948 * need (sz), and see whether all argument are strings.
9949 */
9950 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009951#ifdef Py_DEBUG
9952 use_memcpy = 0;
9953#else
9954 use_memcpy = 1;
9955#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009956 for (i = 0; i < seqlen; i++) {
9957 const Py_ssize_t old_sz = sz;
9958 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 if (!PyUnicode_Check(item)) {
9960 PyErr_Format(PyExc_TypeError,
9961 "sequence item %zd: expected str instance,"
9962 " %.80s found",
9963 i, Py_TYPE(item)->tp_name);
9964 goto onError;
9965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (PyUnicode_READY(item) == -1)
9967 goto onError;
9968 sz += PyUnicode_GET_LENGTH(item);
9969 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009970 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009971 if (i != 0)
9972 sz += seplen;
9973 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9974 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009976 goto onError;
9977 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009978 if (use_memcpy && last_obj != NULL) {
9979 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9980 use_memcpy = 0;
9981 }
9982 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 }
Tim Petersced69f82003-09-16 20:30:58 +00009984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 if (res == NULL)
9987 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009988
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009989 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009990#ifdef Py_DEBUG
9991 use_memcpy = 0;
9992#else
9993 if (use_memcpy) {
9994 res_data = PyUnicode_1BYTE_DATA(res);
9995 kind = PyUnicode_KIND(res);
9996 if (seplen != 0)
9997 sep_data = PyUnicode_1BYTE_DATA(sep);
9998 }
9999#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010001 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010004 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010005 if (use_memcpy) {
10006 Py_MEMCPY(res_data,
10007 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010008 kind * seplen);
10009 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010010 }
10011 else {
10012 copy_characters(res, res_offset, sep, 0, seplen);
10013 res_offset += seplen;
10014 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010016 itemlen = PyUnicode_GET_LENGTH(item);
10017 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010018 if (use_memcpy) {
10019 Py_MEMCPY(res_data,
10020 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010021 kind * itemlen);
10022 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010023 }
10024 else {
10025 copy_characters(res, res_offset, item, 0, itemlen);
10026 res_offset += itemlen;
10027 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010028 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010029 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010030 if (use_memcpy)
10031 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 else
10034 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010035
Tim Peters05eba1f2004-08-27 21:32:02 +000010036 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010038 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010042 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010044 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045 return NULL;
10046}
10047
/* Store `value` into `length` consecutive code units of `data`, beginning
   at code-unit index `start`.  `kind` selects the width of one code unit:
   PyUnicode_1BYTE_KIND (memset of unsigned char), PyUnicode_2BYTE_KIND
   (Py_UCS2 loop) or PyUnicode_4BYTE_KIND (Py_UCS4 loop).
   PyUnicode_WCHAR_KIND and any other kind trip an assertion.

   Every argument is parenthesized where it is used, so compound
   expressions such as `ch >> 8` are cast and stored as a whole.  Arguments
   may be evaluated more than once and must therefore be free of side
   effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10071
Victor Stinner3fe55312012-01-04 00:33:50 +010010072Py_ssize_t
10073PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10074 Py_UCS4 fill_char)
10075{
10076 Py_ssize_t maxlen;
10077 enum PyUnicode_Kind kind;
10078 void *data;
10079
10080 if (!PyUnicode_Check(unicode)) {
10081 PyErr_BadInternalCall();
10082 return -1;
10083 }
10084 if (PyUnicode_READY(unicode) == -1)
10085 return -1;
10086 if (unicode_check_modifiable(unicode))
10087 return -1;
10088
10089 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10090 PyErr_SetString(PyExc_ValueError,
10091 "fill character is bigger than "
10092 "the string maximum character");
10093 return -1;
10094 }
10095
10096 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10097 length = Py_MIN(maxlen, length);
10098 if (length <= 0)
10099 return 0;
10100
10101 kind = PyUnicode_KIND(unicode);
10102 data = PyUnicode_DATA(unicode);
10103 FILL(kind, data, fill_char, start, length);
10104 return length;
10105}
10106
Victor Stinner9310abb2011-10-05 00:59:23 +020010107static PyObject *
10108pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010109 Py_ssize_t left,
10110 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 PyObject *u;
10114 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010115 int kind;
10116 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117
10118 if (left < 0)
10119 left = 0;
10120 if (right < 0)
10121 right = 0;
10122
Victor Stinnerc4b49542011-12-11 22:44:26 +010010123 if (left == 0 && right == 0)
10124 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10127 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010128 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10129 return NULL;
10130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +020010132 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010134 if (!u)
10135 return NULL;
10136
10137 kind = PyUnicode_KIND(u);
10138 data = PyUnicode_DATA(u);
10139 if (left)
10140 FILL(kind, data, fill, 0, left);
10141 if (right)
10142 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010143 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010144 assert(_PyUnicode_CheckConsistency(u, 1));
10145 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146}
10147
Alexander Belopolsky40018472011-02-26 01:02:56 +000010148PyObject *
10149PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152
10153 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010154 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010155 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010156 if (PyUnicode_READY(string) == -1) {
10157 Py_DECREF(string);
10158 return NULL;
10159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
Benjamin Petersonead6b532011-12-20 17:23:42 -060010161 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010163 if (PyUnicode_IS_ASCII(string))
10164 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010166 PyUnicode_GET_LENGTH(string), keepends);
10167 else
10168 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010170 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 break;
10172 case PyUnicode_2BYTE_KIND:
10173 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010174 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 PyUnicode_GET_LENGTH(string), keepends);
10176 break;
10177 case PyUnicode_4BYTE_KIND:
10178 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 PyUnicode_GET_LENGTH(string), keepends);
10181 break;
10182 default:
10183 assert(0);
10184 list = 0;
10185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186 Py_DECREF(string);
10187 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188}
10189
Alexander Belopolsky40018472011-02-26 01:02:56 +000010190static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010191split(PyObject *self,
10192 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010193 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 int kind1, kind2, kind;
10196 void *buf1, *buf2;
10197 Py_ssize_t len1, len2;
10198 PyObject* out;
10199
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010201 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 if (PyUnicode_READY(self) == -1)
10204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010207 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 if (PyUnicode_IS_ASCII(self))
10210 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010211 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 PyUnicode_GET_LENGTH(self), maxcount
10213 );
10214 else
10215 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010216 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010217 PyUnicode_GET_LENGTH(self), maxcount
10218 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 case PyUnicode_2BYTE_KIND:
10220 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010221 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 PyUnicode_GET_LENGTH(self), maxcount
10223 );
10224 case PyUnicode_4BYTE_KIND:
10225 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyUnicode_GET_LENGTH(self), maxcount
10228 );
10229 default:
10230 assert(0);
10231 return NULL;
10232 }
10233
10234 if (PyUnicode_READY(substring) == -1)
10235 return NULL;
10236
10237 kind1 = PyUnicode_KIND(self);
10238 kind2 = PyUnicode_KIND(substring);
10239 kind = kind1 > kind2 ? kind1 : kind2;
10240 buf1 = PyUnicode_DATA(self);
10241 buf2 = PyUnicode_DATA(substring);
10242 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010243 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (!buf1)
10245 return NULL;
10246 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (!buf2) {
10249 if (kind1 != kind) PyMem_Free(buf1);
10250 return NULL;
10251 }
10252 len1 = PyUnicode_GET_LENGTH(self);
10253 len2 = PyUnicode_GET_LENGTH(substring);
10254
Benjamin Petersonead6b532011-12-20 17:23:42 -060010255 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10258 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010259 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010260 else
10261 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 break;
10264 case PyUnicode_2BYTE_KIND:
10265 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 break;
10268 case PyUnicode_4BYTE_KIND:
10269 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010270 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 break;
10272 default:
10273 out = NULL;
10274 }
10275 if (kind1 != kind)
10276 PyMem_Free(buf1);
10277 if (kind2 != kind)
10278 PyMem_Free(buf2);
10279 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280}
10281
Alexander Belopolsky40018472011-02-26 01:02:56 +000010282static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010283rsplit(PyObject *self,
10284 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010285 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 int kind1, kind2, kind;
10288 void *buf1, *buf2;
10289 Py_ssize_t len1, len2;
10290 PyObject* out;
10291
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010292 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010293 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 if (PyUnicode_READY(self) == -1)
10296 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010299 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010301 if (PyUnicode_IS_ASCII(self))
10302 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010303 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010304 PyUnicode_GET_LENGTH(self), maxcount
10305 );
10306 else
10307 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010309 PyUnicode_GET_LENGTH(self), maxcount
10310 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 case PyUnicode_2BYTE_KIND:
10312 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 PyUnicode_GET_LENGTH(self), maxcount
10315 );
10316 case PyUnicode_4BYTE_KIND:
10317 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010318 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 PyUnicode_GET_LENGTH(self), maxcount
10320 );
10321 default:
10322 assert(0);
10323 return NULL;
10324 }
10325
10326 if (PyUnicode_READY(substring) == -1)
10327 return NULL;
10328
10329 kind1 = PyUnicode_KIND(self);
10330 kind2 = PyUnicode_KIND(substring);
10331 kind = kind1 > kind2 ? kind1 : kind2;
10332 buf1 = PyUnicode_DATA(self);
10333 buf2 = PyUnicode_DATA(substring);
10334 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 if (!buf1)
10337 return NULL;
10338 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 if (!buf2) {
10341 if (kind1 != kind) PyMem_Free(buf1);
10342 return NULL;
10343 }
10344 len1 = PyUnicode_GET_LENGTH(self);
10345 len2 = PyUnicode_GET_LENGTH(substring);
10346
Benjamin Petersonead6b532011-12-20 17:23:42 -060010347 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10350 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010351 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010352 else
10353 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 break;
10356 case PyUnicode_2BYTE_KIND:
10357 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 break;
10360 case PyUnicode_4BYTE_KIND:
10361 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010362 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 break;
10364 default:
10365 out = NULL;
10366 }
10367 if (kind1 != kind)
10368 PyMem_Free(buf1);
10369 if (kind2 != kind)
10370 PyMem_Free(buf2);
10371 return out;
10372}
10373
10374static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10376 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010378 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10381 return asciilib_find(buf1, len1, buf2, len2, offset);
10382 else
10383 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 case PyUnicode_2BYTE_KIND:
10385 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10386 case PyUnicode_4BYTE_KIND:
10387 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10388 }
10389 assert(0);
10390 return -1;
10391}
10392
10393static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10395 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010397 switch (kind) {
10398 case PyUnicode_1BYTE_KIND:
10399 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10400 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10401 else
10402 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10403 case PyUnicode_2BYTE_KIND:
10404 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10405 case PyUnicode_4BYTE_KIND:
10406 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10407 }
10408 assert(0);
10409 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010410}
10411
Alexander Belopolsky40018472011-02-26 01:02:56 +000010412static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413replace(PyObject *self, PyObject *str1,
10414 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyObject *u;
10417 char *sbuf = PyUnicode_DATA(self);
10418 char *buf1 = PyUnicode_DATA(str1);
10419 char *buf2 = PyUnicode_DATA(str2);
10420 int srelease = 0, release1 = 0, release2 = 0;
10421 int skind = PyUnicode_KIND(self);
10422 int kind1 = PyUnicode_KIND(str1);
10423 int kind2 = PyUnicode_KIND(str2);
10424 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10425 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10426 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 int mayshrink;
10428 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429
10430 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010433 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434
Victor Stinner59de0ee2011-10-07 10:01:28 +020010435 if (str1 == str2)
10436 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 if (skind < kind1)
10438 /* substring too wide to be present */
10439 goto nothing;
10440
Victor Stinner49a0a212011-10-12 23:46:10 +020010441 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10442 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10443 /* Replacing str1 with str2 may cause a maxchar reduction in the
10444 result string. */
10445 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010446 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010451 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010454 Py_UCS4 u1, u2;
10455 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010456 Py_ssize_t index, pos;
10457 char *src;
10458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010460 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10461 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010462 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010467 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010469
10470 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10471 index = 0;
10472 src = sbuf;
10473 while (--maxcount)
10474 {
10475 pos++;
10476 src += pos * PyUnicode_KIND(self);
10477 slen -= pos;
10478 index += pos;
10479 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10480 if (pos < 0)
10481 break;
10482 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10483 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010484 }
10485 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 int rkind = skind;
10487 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010488 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (kind1 < rkind) {
10491 /* widen substring */
10492 buf1 = _PyUnicode_AsKind(str1, rkind);
10493 if (!buf1) goto error;
10494 release1 = 1;
10495 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010496 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 if (i < 0)
10498 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (rkind > kind2) {
10500 /* widen replacement */
10501 buf2 = _PyUnicode_AsKind(str2, rkind);
10502 if (!buf2) goto error;
10503 release2 = 1;
10504 }
10505 else if (rkind < kind2) {
10506 /* widen self and buf1 */
10507 rkind = kind2;
10508 if (release1) PyMem_Free(buf1);
10509 sbuf = _PyUnicode_AsKind(self, rkind);
10510 if (!sbuf) goto error;
10511 srelease = 1;
10512 buf1 = _PyUnicode_AsKind(str1, rkind);
10513 if (!buf1) goto error;
10514 release1 = 1;
10515 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010516 u = PyUnicode_New(slen, maxchar);
10517 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010519 assert(PyUnicode_KIND(u) == rkind);
10520 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010521
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010522 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010523 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010524 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010526 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010528
10529 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010530 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010531 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010532 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010533 if (i == -1)
10534 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010535 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010537 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010541 }
10542 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 Py_ssize_t n, i, j, ires;
10544 Py_ssize_t product, new_size;
10545 int rkind = skind;
10546 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 buf1 = _PyUnicode_AsKind(str1, rkind);
10551 if (!buf1) goto error;
10552 release1 = 1;
10553 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010555 if (n == 0)
10556 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010558 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 buf2 = _PyUnicode_AsKind(str2, rkind);
10560 if (!buf2) goto error;
10561 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 rkind = kind2;
10566 sbuf = _PyUnicode_AsKind(self, rkind);
10567 if (!sbuf) goto error;
10568 srelease = 1;
10569 if (release1) PyMem_Free(buf1);
10570 buf1 = _PyUnicode_AsKind(str1, rkind);
10571 if (!buf1) goto error;
10572 release1 = 1;
10573 }
10574 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10575 PyUnicode_GET_LENGTH(str1))); */
10576 product = n * (len2-len1);
10577 if ((product / (len2-len1)) != n) {
10578 PyErr_SetString(PyExc_OverflowError,
10579 "replace string is too long");
10580 goto error;
10581 }
10582 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 if (new_size == 0) {
10584 Py_INCREF(unicode_empty);
10585 u = unicode_empty;
10586 goto done;
10587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10589 PyErr_SetString(PyExc_OverflowError,
10590 "replace string is too long");
10591 goto error;
10592 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010593 u = PyUnicode_New(new_size, maxchar);
10594 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010596 assert(PyUnicode_KIND(u) == rkind);
10597 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 ires = i = 0;
10599 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 while (n-- > 0) {
10601 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010603 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010604 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010605 if (j == -1)
10606 break;
10607 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010609 memcpy(res + rkind * ires,
10610 sbuf + rkind * i,
10611 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 }
10614 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010616 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010618 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010625 memcpy(res + rkind * ires,
10626 sbuf + rkind * i,
10627 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010628 }
10629 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010630 /* interleave */
10631 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010634 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 if (--n <= 0)
10637 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010638 memcpy(res + rkind * ires,
10639 sbuf + rkind * i,
10640 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 ires++;
10642 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010644 memcpy(res + rkind * ires,
10645 sbuf + rkind * i,
10646 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010648 }
10649
10650 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010651 unicode_adjust_maxchar(&u);
10652 if (u == NULL)
10653 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010655
10656 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (srelease)
10658 PyMem_FREE(sbuf);
10659 if (release1)
10660 PyMem_FREE(buf1);
10661 if (release2)
10662 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010663 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665
Benjamin Peterson29060642009-01-31 22:14:21 +000010666 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (srelease)
10669 PyMem_FREE(sbuf);
10670 if (release1)
10671 PyMem_FREE(buf1);
10672 if (release2)
10673 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010674 return unicode_result_unchanged(self);
10675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 error:
10677 if (srelease && sbuf)
10678 PyMem_FREE(sbuf);
10679 if (release1 && buf1)
10680 PyMem_FREE(buf1);
10681 if (release2 && buf2)
10682 PyMem_FREE(buf2);
10683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684}
10685
10686/* --- Unicode Object Methods --------------------------------------------- */
10687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010688PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690\n\
10691Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010692characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
10694static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010695unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010697 if (PyUnicode_READY(self) == -1)
10698 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010699 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700}
10701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010702PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704\n\
10705Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010706have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707
10708static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010709unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010711 if (PyUnicode_READY(self) == -1)
10712 return NULL;
10713 if (PyUnicode_GET_LENGTH(self) == 0)
10714 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010715 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716}
10717
Benjamin Petersond5890c82012-01-14 13:23:30 -050010718PyDoc_STRVAR(casefold__doc__,
10719 "S.casefold() -> str\n\
10720\n\
10721Return a version of S suitable for caseless comparisons.");
10722
10723static PyObject *
10724unicode_casefold(PyObject *self)
10725{
10726 if (PyUnicode_READY(self) == -1)
10727 return NULL;
10728 if (PyUnicode_IS_ASCII(self))
10729 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010730 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010731}
10732
10733
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010734/* Argument converter. Coerces to a single unicode character */
10735
10736static int
10737convert_uc(PyObject *obj, void *addr)
10738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010740 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010741
Benjamin Peterson14339b62009-01-31 16:36:08 +000010742 uniobj = PyUnicode_FromObject(obj);
10743 if (uniobj == NULL) {
10744 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010746 return 0;
10747 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010751 Py_DECREF(uniobj);
10752 return 0;
10753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010755 Py_DECREF(uniobj);
10756 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010757}
10758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010759PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010762Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010763done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
10765static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010766unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010768 Py_ssize_t marg, left;
10769 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 Py_UCS4 fillchar = ' ';
10771
Victor Stinnere9a29352011-10-01 02:14:59 +020010772 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
Benjamin Petersonbac79492012-01-14 13:34:47 -050010775 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 return NULL;
10777
Victor Stinnerc4b49542011-12-11 22:44:26 +010010778 if (PyUnicode_GET_LENGTH(self) >= width)
10779 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780
Victor Stinnerc4b49542011-12-11 22:44:26 +010010781 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 left = marg / 2 + (marg & width & 1);
10783
Victor Stinner9310abb2011-10-05 00:59:23 +020010784 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785}
10786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787/* This function assumes that str1 and str2 are readied by the caller. */
10788
Marc-André Lemburge5034372000-08-08 08:04:29 +000010789static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010790unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 int kind1, kind2;
10793 void *data1, *data2;
10794 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 kind1 = PyUnicode_KIND(str1);
10797 kind2 = PyUnicode_KIND(str2);
10798 data1 = PyUnicode_DATA(str1);
10799 data2 = PyUnicode_DATA(str2);
10800 len1 = PyUnicode_GET_LENGTH(str1);
10801 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 for (i = 0; i < len1 && i < len2; ++i) {
10804 Py_UCS4 c1, c2;
10805 c1 = PyUnicode_READ(kind1, data1, i);
10806 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010807
10808 if (c1 != c2)
10809 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010810 }
10811
10812 return (len1 < len2) ? -1 : (len1 != len2);
10813}
10814
Alexander Belopolsky40018472011-02-26 01:02:56 +000010815int
10816PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10819 if (PyUnicode_READY(left) == -1 ||
10820 PyUnicode_READY(right) == -1)
10821 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010822 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010824 PyErr_Format(PyExc_TypeError,
10825 "Can't compare %.100s and %.100s",
10826 left->ob_type->tp_name,
10827 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828 return -1;
10829}
10830
Martin v. Löwis5b222132007-06-10 09:51:05 +000010831int
10832PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10833{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 Py_ssize_t i;
10835 int kind;
10836 void *data;
10837 Py_UCS4 chr;
10838
Victor Stinner910337b2011-10-03 03:20:16 +020010839 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 if (PyUnicode_READY(uni) == -1)
10841 return -1;
10842 kind = PyUnicode_KIND(uni);
10843 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010844 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10846 if (chr != str[i])
10847 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010848 /* This check keeps Python strings that end in '\0' from comparing equal
10849 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010852 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010853 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010854 return 0;
10855}
10856
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010857
Benjamin Peterson29060642009-01-31 22:14:21 +000010858#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010859 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010860
Alexander Belopolsky40018472011-02-26 01:02:56 +000010861PyObject *
10862PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010863{
10864 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010865
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010866 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10867 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 if (PyUnicode_READY(left) == -1 ||
10869 PyUnicode_READY(right) == -1)
10870 return NULL;
10871 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10872 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010873 if (op == Py_EQ) {
10874 Py_INCREF(Py_False);
10875 return Py_False;
10876 }
10877 if (op == Py_NE) {
10878 Py_INCREF(Py_True);
10879 return Py_True;
10880 }
10881 }
10882 if (left == right)
10883 result = 0;
10884 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010885 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010887 /* Convert the return value to a Boolean */
10888 switch (op) {
10889 case Py_EQ:
10890 v = TEST_COND(result == 0);
10891 break;
10892 case Py_NE:
10893 v = TEST_COND(result != 0);
10894 break;
10895 case Py_LE:
10896 v = TEST_COND(result <= 0);
10897 break;
10898 case Py_GE:
10899 v = TEST_COND(result >= 0);
10900 break;
10901 case Py_LT:
10902 v = TEST_COND(result == -1);
10903 break;
10904 case Py_GT:
10905 v = TEST_COND(result == 1);
10906 break;
10907 default:
10908 PyErr_BadArgument();
10909 return NULL;
10910 }
10911 Py_INCREF(v);
10912 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010913 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010914
Brian Curtindfc80e32011-08-10 20:28:54 -050010915 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010916}
10917
Alexander Belopolsky40018472011-02-26 01:02:56 +000010918int
10919PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010920{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 int kind1, kind2, kind;
10923 void *buf1, *buf2;
10924 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010925 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010926
10927 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010928 sub = PyUnicode_FromObject(element);
10929 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 PyErr_Format(PyExc_TypeError,
10931 "'in <string>' requires string as left operand, not %s",
10932 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010933 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010934 }
10935
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010937 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010938 Py_DECREF(sub);
10939 return -1;
10940 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010941 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10942 Py_DECREF(sub);
10943 Py_DECREF(str);
10944 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 kind1 = PyUnicode_KIND(str);
10947 kind2 = PyUnicode_KIND(sub);
10948 kind = kind1 > kind2 ? kind1 : kind2;
10949 buf1 = PyUnicode_DATA(str);
10950 buf2 = PyUnicode_DATA(sub);
10951 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010952 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (!buf1) {
10954 Py_DECREF(sub);
10955 return -1;
10956 }
10957 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010958 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (!buf2) {
10960 Py_DECREF(sub);
10961 if (kind1 != kind) PyMem_Free(buf1);
10962 return -1;
10963 }
10964 len1 = PyUnicode_GET_LENGTH(str);
10965 len2 = PyUnicode_GET_LENGTH(sub);
10966
Benjamin Petersonead6b532011-12-20 17:23:42 -060010967 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 case PyUnicode_1BYTE_KIND:
10969 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10970 break;
10971 case PyUnicode_2BYTE_KIND:
10972 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10973 break;
10974 case PyUnicode_4BYTE_KIND:
10975 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10976 break;
10977 default:
10978 result = -1;
10979 assert(0);
10980 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010981
10982 Py_DECREF(str);
10983 Py_DECREF(sub);
10984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 if (kind1 != kind)
10986 PyMem_Free(buf1);
10987 if (kind2 != kind)
10988 PyMem_Free(buf2);
10989
Guido van Rossum403d68b2000-03-13 15:55:09 +000010990 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010991}
10992
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993/* Concat to string or Unicode object giving a new Unicode object. */
10994
Alexander Belopolsky40018472011-02-26 01:02:56 +000010995PyObject *
10996PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010999 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011000 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
11002 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009
11010 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011011 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011015 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 }
11019
Victor Stinner488fa492011-12-12 00:01:39 +010011020 u_len = PyUnicode_GET_LENGTH(u);
11021 v_len = PyUnicode_GET_LENGTH(v);
11022 if (u_len > PY_SSIZE_T_MAX - v_len) {
11023 PyErr_SetString(PyExc_OverflowError,
11024 "strings are too large to concat");
11025 goto onError;
11026 }
11027 new_len = u_len + v_len;
11028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011030 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020011031 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011034 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011037 copy_characters(w, 0, u, 0, u_len);
11038 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 Py_DECREF(u);
11040 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011041 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Benjamin Peterson29060642009-01-31 22:14:21 +000011044 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 Py_XDECREF(u);
11046 Py_XDECREF(v);
11047 return NULL;
11048}
11049
Walter Dörwald1ab83302007-05-18 17:15:44 +000011050void
Victor Stinner23e56682011-10-03 03:54:37 +020011051PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011052{
Victor Stinner23e56682011-10-03 03:54:37 +020011053 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011054 Py_UCS4 maxchar, maxchar2;
11055 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011056
11057 if (p_left == NULL) {
11058 if (!PyErr_Occurred())
11059 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011060 return;
11061 }
Victor Stinner23e56682011-10-03 03:54:37 +020011062 left = *p_left;
11063 if (right == NULL || !PyUnicode_Check(left)) {
11064 if (!PyErr_Occurred())
11065 PyErr_BadInternalCall();
11066 goto error;
11067 }
11068
Benjamin Petersonbac79492012-01-14 13:34:47 -050011069 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011070 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011071 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011072 goto error;
11073
Victor Stinner488fa492011-12-12 00:01:39 +010011074 /* Shortcuts */
11075 if (left == unicode_empty) {
11076 Py_DECREF(left);
11077 Py_INCREF(right);
11078 *p_left = right;
11079 return;
11080 }
11081 if (right == unicode_empty)
11082 return;
11083
11084 left_len = PyUnicode_GET_LENGTH(left);
11085 right_len = PyUnicode_GET_LENGTH(right);
11086 if (left_len > PY_SSIZE_T_MAX - right_len) {
11087 PyErr_SetString(PyExc_OverflowError,
11088 "strings are too large to concat");
11089 goto error;
11090 }
11091 new_len = left_len + right_len;
11092
11093 if (unicode_modifiable(left)
11094 && PyUnicode_CheckExact(right)
11095 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011096 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11097 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011098 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011099 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011100 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11101 {
11102 /* append inplace */
11103 if (unicode_resize(p_left, new_len) != 0) {
11104 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11105 * deallocated so it cannot be put back into
11106 * 'variable'. The MemoryError is raised when there
11107 * is no value in 'variable', which might (very
11108 * remotely) be a cause of incompatibilities.
11109 */
11110 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011111 }
Victor Stinner488fa492011-12-12 00:01:39 +010011112 /* copy 'right' into the newly allocated area of 'left' */
11113 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011114 }
Victor Stinner488fa492011-12-12 00:01:39 +010011115 else {
11116 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11117 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020011118 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011119
Victor Stinner488fa492011-12-12 00:01:39 +010011120 /* Concat the two Unicode strings */
11121 res = PyUnicode_New(new_len, maxchar);
11122 if (res == NULL)
11123 goto error;
11124 copy_characters(res, 0, left, 0, left_len);
11125 copy_characters(res, left_len, right, 0, right_len);
11126 Py_DECREF(left);
11127 *p_left = res;
11128 }
11129 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011130 return;
11131
11132error:
Victor Stinner488fa492011-12-12 00:01:39 +010011133 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011134}
11135
11136void
11137PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011139 PyUnicode_Append(pleft, right);
11140 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011141}
11142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011147string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011148interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
11150static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011151unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011153 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011154 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011155 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 int kind1, kind2, kind;
11158 void *buf1, *buf2;
11159 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160
Jesus Ceaac451502011-04-20 17:09:23 +020011161 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11162 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 kind1 = PyUnicode_KIND(self);
11166 kind2 = PyUnicode_KIND(substring);
11167 kind = kind1 > kind2 ? kind1 : kind2;
11168 buf1 = PyUnicode_DATA(self);
11169 buf2 = PyUnicode_DATA(substring);
11170 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011171 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (!buf1) {
11173 Py_DECREF(substring);
11174 return NULL;
11175 }
11176 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011177 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (!buf2) {
11179 Py_DECREF(substring);
11180 if (kind1 != kind) PyMem_Free(buf1);
11181 return NULL;
11182 }
11183 len1 = PyUnicode_GET_LENGTH(self);
11184 len2 = PyUnicode_GET_LENGTH(substring);
11185
11186 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011187 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 case PyUnicode_1BYTE_KIND:
11189 iresult = ucs1lib_count(
11190 ((Py_UCS1*)buf1) + start, end - start,
11191 buf2, len2, PY_SSIZE_T_MAX
11192 );
11193 break;
11194 case PyUnicode_2BYTE_KIND:
11195 iresult = ucs2lib_count(
11196 ((Py_UCS2*)buf1) + start, end - start,
11197 buf2, len2, PY_SSIZE_T_MAX
11198 );
11199 break;
11200 case PyUnicode_4BYTE_KIND:
11201 iresult = ucs4lib_count(
11202 ((Py_UCS4*)buf1) + start, end - start,
11203 buf2, len2, PY_SSIZE_T_MAX
11204 );
11205 break;
11206 default:
11207 assert(0); iresult = 0;
11208 }
11209
11210 result = PyLong_FromSsize_t(iresult);
11211
11212 if (kind1 != kind)
11213 PyMem_Free(buf1);
11214 if (kind2 != kind)
11215 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216
11217 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011218
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 return result;
11220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011223 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011225Encode S using the codec registered for encoding. Default encoding\n\
11226is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011227handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011228a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11229'xmlcharrefreplace' as well as any other name registered with\n\
11230codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
11232static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011233unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011235 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 char *encoding = NULL;
11237 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011238
Benjamin Peterson308d6372009-09-18 21:42:35 +000011239 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11240 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011242 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011243}
11244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247\n\
11248Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
11251static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011252unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011254 Py_ssize_t i, j, line_pos, src_len, incr;
11255 Py_UCS4 ch;
11256 PyObject *u;
11257 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011260 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
11262 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264
Antoine Pitrou22425222011-10-04 19:10:51 +020011265 if (PyUnicode_READY(self) == -1)
11266 return NULL;
11267
Thomas Wouters7e474022000-07-16 12:04:32 +000011268 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011269 src_len = PyUnicode_GET_LENGTH(self);
11270 i = j = line_pos = 0;
11271 kind = PyUnicode_KIND(self);
11272 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011273 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 for (; i < src_len; i++) {
11275 ch = PyUnicode_READ(kind, src_data, i);
11276 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011277 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011281 goto overflow;
11282 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011284 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 goto overflow;
11289 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 if (ch == '\n' || ch == '\r')
11292 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011295 if (!found)
11296 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011297
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011299 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 if (!u)
11301 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011302 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305
Antoine Pitroue71d5742011-10-04 15:55:09 +020011306 for (; i < src_len; i++) {
11307 ch = PyUnicode_READ(kind, src_data, i);
11308 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 incr = tabsize - (line_pos % tabsize);
11311 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011312 FILL(kind, dest_data, ' ', j, incr);
11313 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011315 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011316 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011317 line_pos++;
11318 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011319 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011320 if (ch == '\n' || ch == '\r')
11321 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011323 }
11324 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011325 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011326
Antoine Pitroue71d5742011-10-04 15:55:09 +020011327 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011328 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330}
11331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011332PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334\n\
11335Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011336such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337arguments start and end are interpreted as in slice notation.\n\
11338\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
11341static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011344 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011345 Py_ssize_t start;
11346 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011347 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
Jesus Ceaac451502011-04-20 17:09:23 +020011349 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11350 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (PyUnicode_READY(self) == -1)
11354 return NULL;
11355 if (PyUnicode_READY(substring) == -1)
11356 return NULL;
11357
Victor Stinner7931d9a2011-11-04 00:22:48 +010011358 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
11360 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 if (result == -2)
11363 return NULL;
11364
Christian Heimes217cfd12007-12-02 14:31:20 +000011365 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366}
11367
11368static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011369unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011371 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11372 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375}
11376
Guido van Rossumc2504932007-09-18 19:42:40 +000011377/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011378 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011379static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011380unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381{
Guido van Rossumc2504932007-09-18 19:42:40 +000011382 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011383 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011384
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011385#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011386 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011387#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (_PyUnicode_HASH(self) != -1)
11389 return _PyUnicode_HASH(self);
11390 if (PyUnicode_READY(self) == -1)
11391 return -1;
11392 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011393 /*
11394 We make the hash of the empty string be 0, rather than using
11395 (prefix ^ suffix), since this slightly obfuscates the hash secret
11396 */
11397 if (len == 0) {
11398 _PyUnicode_HASH(self) = 0;
11399 return 0;
11400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401
11402 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011403#define HASH(P) \
11404 x ^= (Py_uhash_t) *P << 7; \
11405 while (--len >= 0) \
11406 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407
Georg Brandl2fb477c2012-02-21 00:33:36 +010011408 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 switch (PyUnicode_KIND(self)) {
11410 case PyUnicode_1BYTE_KIND: {
11411 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11412 HASH(c);
11413 break;
11414 }
11415 case PyUnicode_2BYTE_KIND: {
11416 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11417 HASH(s);
11418 break;
11419 }
11420 default: {
11421 Py_UCS4 *l;
11422 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11423 "Impossible switch case in unicode_hash");
11424 l = PyUnicode_4BYTE_DATA(self);
11425 HASH(l);
11426 break;
11427 }
11428 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011429 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11430 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431
Guido van Rossumc2504932007-09-18 19:42:40 +000011432 if (x == -1)
11433 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011435 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011447 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011448 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011449 Py_ssize_t start;
11450 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Jesus Ceaac451502011-04-20 17:09:23 +020011452 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11453 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 if (PyUnicode_READY(substring) == -1)
11459 return NULL;
11460
Victor Stinner7931d9a2011-11-04 00:22:48 +010011461 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 if (result == -2)
11466 return NULL;
11467
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 if (result < 0) {
11469 PyErr_SetString(PyExc_ValueError, "substring not found");
11470 return NULL;
11471 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011472
Christian Heimes217cfd12007-12-02 14:31:20 +000011473 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474}
11475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011476PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011479Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011483unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 Py_ssize_t i, length;
11486 int kind;
11487 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 int cased;
11489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 if (PyUnicode_READY(self) == -1)
11491 return NULL;
11492 length = PyUnicode_GET_LENGTH(self);
11493 kind = PyUnicode_KIND(self);
11494 data = PyUnicode_DATA(self);
11495
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (length == 1)
11498 return PyBool_FromLong(
11499 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011501 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011504
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 for (i = 0; i < length; i++) {
11507 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011508
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11510 return PyBool_FromLong(0);
11511 else if (!cased && Py_UNICODE_ISLOWER(ch))
11512 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011514 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515}
11516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011517PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011520Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011521at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522
11523static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011524unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 Py_ssize_t i, length;
11527 int kind;
11528 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 int cased;
11530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 if (PyUnicode_READY(self) == -1)
11532 return NULL;
11533 length = PyUnicode_GET_LENGTH(self);
11534 kind = PyUnicode_KIND(self);
11535 data = PyUnicode_DATA(self);
11536
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 if (length == 1)
11539 return PyBool_FromLong(
11540 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011542 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011545
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 for (i = 0; i < length; i++) {
11548 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011549
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11551 return PyBool_FromLong(0);
11552 else if (!cased && Py_UNICODE_ISUPPER(ch))
11553 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011555 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011558PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011561Return True if S is a titlecased string and there is at least one\n\
11562character in S, i.e. upper- and titlecase characters may only\n\
11563follow uncased characters and lowercase characters only cased ones.\n\
11564Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
11566static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011567unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 Py_ssize_t i, length;
11570 int kind;
11571 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 int cased, previous_is_cased;
11573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (PyUnicode_READY(self) == -1)
11575 return NULL;
11576 length = PyUnicode_GET_LENGTH(self);
11577 kind = PyUnicode_KIND(self);
11578 data = PyUnicode_DATA(self);
11579
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 if (length == 1) {
11582 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11583 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11584 (Py_UNICODE_ISUPPER(ch) != 0));
11585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011587 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011590
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 cased = 0;
11592 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 for (i = 0; i < length; i++) {
11594 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011595
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11597 if (previous_is_cased)
11598 return PyBool_FromLong(0);
11599 previous_is_cased = 1;
11600 cased = 1;
11601 }
11602 else if (Py_UNICODE_ISLOWER(ch)) {
11603 if (!previous_is_cased)
11604 return PyBool_FromLong(0);
11605 previous_is_cased = 1;
11606 cased = 1;
11607 }
11608 else
11609 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011611 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612}
11613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011614PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011617Return True if all characters in S are whitespace\n\
11618and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
11620static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011621unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 Py_ssize_t i, length;
11624 int kind;
11625 void *data;
11626
11627 if (PyUnicode_READY(self) == -1)
11628 return NULL;
11629 length = PyUnicode_GET_LENGTH(self);
11630 kind = PyUnicode_KIND(self);
11631 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 if (length == 1)
11635 return PyBool_FromLong(
11636 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011638 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 for (i = 0; i < length; i++) {
11643 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011644 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011647 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648}
11649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011652\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011653Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011654and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655
11656static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011657unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 Py_ssize_t i, length;
11660 int kind;
11661 void *data;
11662
11663 if (PyUnicode_READY(self) == -1)
11664 return NULL;
11665 length = PyUnicode_GET_LENGTH(self);
11666 kind = PyUnicode_KIND(self);
11667 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011668
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 if (length == 1)
11671 return PyBool_FromLong(
11672 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673
11674 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 for (i = 0; i < length; i++) {
11679 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011681 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011682 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011683}
11684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011685PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011688Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011690
11691static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011692unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 int kind;
11695 void *data;
11696 Py_ssize_t len, i;
11697
11698 if (PyUnicode_READY(self) == -1)
11699 return NULL;
11700
11701 kind = PyUnicode_KIND(self);
11702 data = PyUnicode_DATA(self);
11703 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011704
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011705 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 if (len == 1) {
11707 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11708 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11709 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011710
11711 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 for (i = 0; i < len; i++) {
11716 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011717 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011718 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011719 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011720 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011721}
11722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011726Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011727False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
11729static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011730unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 Py_ssize_t i, length;
11733 int kind;
11734 void *data;
11735
11736 if (PyUnicode_READY(self) == -1)
11737 return NULL;
11738 length = PyUnicode_GET_LENGTH(self);
11739 kind = PyUnicode_KIND(self);
11740 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 if (length == 1)
11744 return PyBool_FromLong(
11745 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011747 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 for (i = 0; i < length; i++) {
11752 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011755 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756}
11757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011758PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011761Return True if all characters in S are digits\n\
11762and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011765unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 Py_ssize_t i, length;
11768 int kind;
11769 void *data;
11770
11771 if (PyUnicode_READY(self) == -1)
11772 return NULL;
11773 length = PyUnicode_GET_LENGTH(self);
11774 kind = PyUnicode_KIND(self);
11775 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 if (length == 1) {
11779 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11780 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011783 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 for (i = 0; i < length; i++) {
11788 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011791 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792}
11793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011794PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011797Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011798False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
11800static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011801unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 Py_ssize_t i, length;
11804 int kind;
11805 void *data;
11806
11807 if (PyUnicode_READY(self) == -1)
11808 return NULL;
11809 length = PyUnicode_GET_LENGTH(self);
11810 kind = PyUnicode_KIND(self);
11811 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (length == 1)
11815 return PyBool_FromLong(
11816 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011818 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 for (i = 0; i < length; i++) {
11823 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011826 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827}
11828
Martin v. Löwis47383402007-08-15 07:32:56 +000011829int
11830PyUnicode_IsIdentifier(PyObject *self)
11831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 int kind;
11833 void *data;
11834 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011835 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (PyUnicode_READY(self) == -1) {
11838 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 }
11841
11842 /* Special case for empty strings */
11843 if (PyUnicode_GET_LENGTH(self) == 0)
11844 return 0;
11845 kind = PyUnicode_KIND(self);
11846 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011847
11848 /* PEP 3131 says that the first character must be in
11849 XID_Start and subsequent characters in XID_Continue,
11850 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011851 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011852 letters, digits, underscore). However, given the current
11853 definition of XID_Start and XID_Continue, it is sufficient
11854 to check just for these, except that _ must be allowed
11855 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011857 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011858 return 0;
11859
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011860 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011863 return 1;
11864}
11865
11866PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011868\n\
11869Return True if S is a valid identifier according\n\
11870to the language definition.");
11871
11872static PyObject*
11873unicode_isidentifier(PyObject *self)
11874{
11875 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11876}
11877
Georg Brandl559e5d72008-06-11 18:37:52 +000011878PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011880\n\
11881Return True if all characters in S are considered\n\
11882printable in repr() or S is empty, False otherwise.");
11883
11884static PyObject*
11885unicode_isprintable(PyObject *self)
11886{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 Py_ssize_t i, length;
11888 int kind;
11889 void *data;
11890
11891 if (PyUnicode_READY(self) == -1)
11892 return NULL;
11893 length = PyUnicode_GET_LENGTH(self);
11894 kind = PyUnicode_KIND(self);
11895 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011896
11897 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 if (length == 1)
11899 return PyBool_FromLong(
11900 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 for (i = 0; i < length; i++) {
11903 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011904 Py_RETURN_FALSE;
11905 }
11906 }
11907 Py_RETURN_TRUE;
11908}
11909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011910PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011911 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912\n\
11913Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011914iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
11916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011917unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011919 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920}
11921
Martin v. Löwis18e16552006-02-15 17:27:45 +000011922static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011923unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (PyUnicode_READY(self) == -1)
11926 return -1;
11927 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928}
11929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011930PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011933Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011934done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
11936static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011937unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011939 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 Py_UCS4 fillchar = ' ';
11941
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011942 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 return NULL;
11944
Benjamin Petersonbac79492012-01-14 13:34:47 -050011945 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Victor Stinnerc4b49542011-12-11 22:44:26 +010011948 if (PyUnicode_GET_LENGTH(self) >= width)
11949 return unicode_result_unchanged(self);
11950
11951 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011954PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011960unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011962 if (PyUnicode_READY(self) == -1)
11963 return NULL;
11964 if (PyUnicode_IS_ASCII(self))
11965 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011966 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967}
11968
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011978/* externally visible for str.strip(unicode) */
11979PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011980_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 void *data;
11983 int kind;
11984 Py_ssize_t i, j, len;
11985 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11988 return NULL;
11989
11990 kind = PyUnicode_KIND(self);
11991 data = PyUnicode_DATA(self);
11992 len = PyUnicode_GET_LENGTH(self);
11993 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11994 PyUnicode_DATA(sepobj),
11995 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011996
Benjamin Peterson14339b62009-01-31 16:36:08 +000011997 i = 0;
11998 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 while (i < len &&
12000 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 i++;
12002 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012003 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012004
Benjamin Peterson14339b62009-01-31 16:36:08 +000012005 j = len;
12006 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 do {
12008 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 } while (j >= i &&
12010 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012012 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012013
Victor Stinner7931d9a2011-11-04 00:22:48 +010012014 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015}
12016
12017PyObject*
12018PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12019{
12020 unsigned char *data;
12021 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012022 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023
Victor Stinnerde636f32011-10-01 03:55:54 +020012024 if (PyUnicode_READY(self) == -1)
12025 return NULL;
12026
12027 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
12028
Victor Stinner12bab6d2011-10-01 01:53:49 +020012029 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010012030 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031
Victor Stinner12bab6d2011-10-01 01:53:49 +020012032 length = end - start;
12033 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012034 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035
Victor Stinnerde636f32011-10-01 03:55:54 +020012036 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012037 PyErr_SetString(PyExc_IndexError, "string index out of range");
12038 return NULL;
12039 }
12040
Victor Stinnerb9275c12011-10-05 14:01:42 +020012041 if (PyUnicode_IS_ASCII(self)) {
12042 kind = PyUnicode_KIND(self);
12043 data = PyUnicode_1BYTE_DATA(self);
12044 return unicode_fromascii(data + start, length);
12045 }
12046 else {
12047 kind = PyUnicode_KIND(self);
12048 data = PyUnicode_1BYTE_DATA(self);
12049 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012050 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012051 length);
12052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
12055static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012056do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 int kind;
12059 void *data;
12060 Py_ssize_t len, i, j;
12061
12062 if (PyUnicode_READY(self) == -1)
12063 return NULL;
12064
12065 kind = PyUnicode_KIND(self);
12066 data = PyUnicode_DATA(self);
12067 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012068
Benjamin Peterson14339b62009-01-31 16:36:08 +000012069 i = 0;
12070 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012072 i++;
12073 }
12074 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012075
Benjamin Peterson14339b62009-01-31 16:36:08 +000012076 j = len;
12077 if (striptype != LEFTSTRIP) {
12078 do {
12079 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012081 j++;
12082 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012083
Victor Stinner7931d9a2011-11-04 00:22:48 +010012084 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085}
12086
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012087
12088static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012089do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012090{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012091 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012092
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12094 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012095
Benjamin Peterson14339b62009-01-31 16:36:08 +000012096 if (sep != NULL && sep != Py_None) {
12097 if (PyUnicode_Check(sep))
12098 return _PyUnicode_XStrip(self, striptype, sep);
12099 else {
12100 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 "%s arg must be None or str",
12102 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012103 return NULL;
12104 }
12105 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012106
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108}
12109
12110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012113\n\
12114Return a copy of the string S with leading and trailing\n\
12115whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012116If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012117
12118static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012119unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 if (PyTuple_GET_SIZE(args) == 0)
12122 return do_strip(self, BOTHSTRIP); /* Common case */
12123 else
12124 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012125}
12126
12127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012128PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130\n\
12131Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012132If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133
12134static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012135unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 if (PyTuple_GET_SIZE(args) == 0)
12138 return do_strip(self, LEFTSTRIP); /* Common case */
12139 else
12140 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012141}
12142
12143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012144PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146\n\
12147Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012148If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149
12150static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012151unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012152{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012153 if (PyTuple_GET_SIZE(args) == 0)
12154 return do_strip(self, RIGHTSTRIP); /* Common case */
12155 else
12156 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012157}
12158
12159
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012161unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
Georg Brandl222de0f2009-04-12 12:01:50 +000012166 if (len < 1) {
12167 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012168 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
Victor Stinnerc4b49542011-12-11 22:44:26 +010012171 /* no repeat, return original string */
12172 if (len == 1)
12173 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012174
Benjamin Petersonbac79492012-01-14 13:34:47 -050012175 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 return NULL;
12177
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012178 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012179 PyErr_SetString(PyExc_OverflowError,
12180 "repeated string is too long");
12181 return NULL;
12182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012184
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012185 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186 if (!u)
12187 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012188 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (PyUnicode_GET_LENGTH(str) == 1) {
12191 const int kind = PyUnicode_KIND(str);
12192 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012193 if (kind == PyUnicode_1BYTE_KIND) {
12194 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012195 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012196 }
12197 else if (kind == PyUnicode_2BYTE_KIND) {
12198 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012199 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012200 ucs2[n] = fill_char;
12201 } else {
12202 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12203 assert(kind == PyUnicode_4BYTE_KIND);
12204 for (n = 0; n < len; ++n)
12205 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 }
12208 else {
12209 /* number of characters copied this far */
12210 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012211 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 char *to = (char *) PyUnicode_DATA(u);
12213 Py_MEMCPY(to, PyUnicode_DATA(str),
12214 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012215 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 n = (done <= nchars-done) ? done : nchars-done;
12217 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012218 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220 }
12221
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012222 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012223 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224}
12225
Alexander Belopolsky40018472011-02-26 01:02:56 +000012226PyObject *
12227PyUnicode_Replace(PyObject *obj,
12228 PyObject *subobj,
12229 PyObject *replobj,
12230 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231{
12232 PyObject *self;
12233 PyObject *str1;
12234 PyObject *str2;
12235 PyObject *result;
12236
12237 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012238 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012241 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 Py_DECREF(self);
12243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 }
12245 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012246 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 Py_DECREF(self);
12248 Py_DECREF(str1);
12249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012251 if (PyUnicode_READY(self) == -1 ||
12252 PyUnicode_READY(str1) == -1 ||
12253 PyUnicode_READY(str2) == -1)
12254 result = NULL;
12255 else
12256 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 Py_DECREF(self);
12258 Py_DECREF(str1);
12259 Py_DECREF(str2);
12260 return result;
12261}
12262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012263PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012264 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265\n\
12266Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012267old replaced by new. If the optional argument count is\n\
12268given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
12270static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 PyObject *str1;
12274 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012275 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 PyObject *result;
12277
Martin v. Löwis18e16552006-02-15 17:27:45 +000012278 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012280 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012283 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 return NULL;
12285 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012286 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 Py_DECREF(str1);
12288 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012289 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012290 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12291 result = NULL;
12292 else
12293 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294
12295 Py_DECREF(str1);
12296 Py_DECREF(str2);
12297 return result;
12298}
12299
Alexander Belopolsky40018472011-02-26 01:02:56 +000012300static PyObject *
12301unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012303 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 Py_ssize_t isize;
12305 Py_ssize_t osize, squote, dquote, i, o;
12306 Py_UCS4 max, quote;
12307 int ikind, okind;
12308 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012311 return NULL;
12312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 isize = PyUnicode_GET_LENGTH(unicode);
12314 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 /* Compute length of output, quote characters, and
12317 maximum character */
12318 osize = 2; /* quotes */
12319 max = 127;
12320 squote = dquote = 0;
12321 ikind = PyUnicode_KIND(unicode);
12322 for (i = 0; i < isize; i++) {
12323 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12324 switch (ch) {
12325 case '\'': squote++; osize++; break;
12326 case '"': dquote++; osize++; break;
12327 case '\\': case '\t': case '\r': case '\n':
12328 osize += 2; break;
12329 default:
12330 /* Fast-path ASCII */
12331 if (ch < ' ' || ch == 0x7f)
12332 osize += 4; /* \xHH */
12333 else if (ch < 0x7f)
12334 osize++;
12335 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12336 osize++;
12337 max = ch > max ? ch : max;
12338 }
12339 else if (ch < 0x100)
12340 osize += 4; /* \xHH */
12341 else if (ch < 0x10000)
12342 osize += 6; /* \uHHHH */
12343 else
12344 osize += 10; /* \uHHHHHHHH */
12345 }
12346 }
12347
12348 quote = '\'';
12349 if (squote) {
12350 if (dquote)
12351 /* Both squote and dquote present. Use squote,
12352 and escape them */
12353 osize += squote;
12354 else
12355 quote = '"';
12356 }
12357
12358 repr = PyUnicode_New(osize, max);
12359 if (repr == NULL)
12360 return NULL;
12361 okind = PyUnicode_KIND(repr);
12362 odata = PyUnicode_DATA(repr);
12363
12364 PyUnicode_WRITE(okind, odata, 0, quote);
12365 PyUnicode_WRITE(okind, odata, osize-1, quote);
12366
12367 for (i = 0, o = 1; i < isize; i++) {
12368 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012369
12370 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 if ((ch == quote) || (ch == '\\')) {
12372 PyUnicode_WRITE(okind, odata, o++, '\\');
12373 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012374 continue;
12375 }
12376
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012378 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 PyUnicode_WRITE(okind, odata, o++, '\\');
12380 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012381 }
12382 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 PyUnicode_WRITE(okind, odata, o++, '\\');
12384 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012385 }
12386 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 PyUnicode_WRITE(okind, odata, o++, '\\');
12388 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012389 }
12390
12391 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012392 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 PyUnicode_WRITE(okind, odata, o++, '\\');
12394 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012395 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12396 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012397 }
12398
Georg Brandl559e5d72008-06-11 18:37:52 +000012399 /* Copy ASCII characters as-is */
12400 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012402 }
12403
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012405 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012407 (categories Z* and C* except ASCII space)
12408 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012410 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 if (ch <= 0xff) {
12412 PyUnicode_WRITE(okind, odata, o++, '\\');
12413 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012416 }
12417 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 else if (ch >= 0x10000) {
12419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12428 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012429 }
12430 /* Map 16-bit characters to '\uxxxx' */
12431 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 PyUnicode_WRITE(okind, odata, o++, '\\');
12433 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12435 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012438 }
12439 }
12440 /* Copy characters as-is */
12441 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012443 }
12444 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012447 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012448 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449}
12450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012451PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453\n\
12454Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012455such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456arguments start and end are interpreted as in slice notation.\n\
12457\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012458Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459
12460static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012463 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012464 Py_ssize_t start;
12465 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467
Jesus Ceaac451502011-04-20 17:09:23 +020012468 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12469 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 if (PyUnicode_READY(self) == -1)
12473 return NULL;
12474 if (PyUnicode_READY(substring) == -1)
12475 return NULL;
12476
Victor Stinner7931d9a2011-11-04 00:22:48 +010012477 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478
12479 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481 if (result == -2)
12482 return NULL;
12483
Christian Heimes217cfd12007-12-02 14:31:20 +000012484 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485}
12486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012487PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012490Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
12492static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012495 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012496 Py_ssize_t start;
12497 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012498 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499
Jesus Ceaac451502011-04-20 17:09:23 +020012500 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12501 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 if (PyUnicode_READY(self) == -1)
12505 return NULL;
12506 if (PyUnicode_READY(substring) == -1)
12507 return NULL;
12508
Victor Stinner7931d9a2011-11-04 00:22:48 +010012509 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510
12511 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 if (result == -2)
12514 return NULL;
12515
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516 if (result < 0) {
12517 PyErr_SetString(PyExc_ValueError, "substring not found");
12518 return NULL;
12519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520
Christian Heimes217cfd12007-12-02 14:31:20 +000012521 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522}
12523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012524PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012527Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012528done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529
12530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012531unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012533 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 Py_UCS4 fillchar = ' ';
12535
Victor Stinnere9a29352011-10-01 02:14:59 +020012536 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012538
Benjamin Petersonbac79492012-01-14 13:34:47 -050012539 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540 return NULL;
12541
Victor Stinnerc4b49542011-12-11 22:44:26 +010012542 if (PyUnicode_GET_LENGTH(self) >= width)
12543 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
Victor Stinnerc4b49542011-12-11 22:44:26 +010012545 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
Alexander Belopolsky40018472011-02-26 01:02:56 +000012548PyObject *
12549PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550{
12551 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012552
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 s = PyUnicode_FromObject(s);
12554 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012555 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012556 if (sep != NULL) {
12557 sep = PyUnicode_FromObject(sep);
12558 if (sep == NULL) {
12559 Py_DECREF(s);
12560 return NULL;
12561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562 }
12563
Victor Stinner9310abb2011-10-05 00:59:23 +020012564 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
12566 Py_DECREF(s);
12567 Py_XDECREF(sep);
12568 return result;
12569}
12570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012571PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012572 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573\n\
12574Return a list of the words in S, using sep as the\n\
12575delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012576splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012577whitespace string is a separator and empty strings are\n\
12578removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579
12580static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012581unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012583 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012585 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012587 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12588 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589 return NULL;
12590
12591 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012594 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012596 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597}
12598
Thomas Wouters477c8d52006-05-27 19:21:47 +000012599PyObject *
12600PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12601{
12602 PyObject* str_obj;
12603 PyObject* sep_obj;
12604 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 int kind1, kind2, kind;
12606 void *buf1 = NULL, *buf2 = NULL;
12607 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012608
12609 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012610 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012612 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012613 if (!sep_obj) {
12614 Py_DECREF(str_obj);
12615 return NULL;
12616 }
12617 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12618 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012619 Py_DECREF(str_obj);
12620 return NULL;
12621 }
12622
Victor Stinner14f8f022011-10-05 20:58:25 +020012623 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012625 kind = Py_MAX(kind1, kind2);
12626 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012628 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 if (!buf1)
12630 goto onError;
12631 buf2 = PyUnicode_DATA(sep_obj);
12632 if (kind2 != kind)
12633 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12634 if (!buf2)
12635 goto onError;
12636 len1 = PyUnicode_GET_LENGTH(str_obj);
12637 len2 = PyUnicode_GET_LENGTH(sep_obj);
12638
Benjamin Petersonead6b532011-12-20 17:23:42 -060012639 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012641 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12642 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12643 else
12644 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 break;
12646 case PyUnicode_2BYTE_KIND:
12647 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12648 break;
12649 case PyUnicode_4BYTE_KIND:
12650 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12651 break;
12652 default:
12653 assert(0);
12654 out = 0;
12655 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012656
12657 Py_DECREF(sep_obj);
12658 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 if (kind1 != kind)
12660 PyMem_Free(buf1);
12661 if (kind2 != kind)
12662 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012663
12664 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 onError:
12666 Py_DECREF(sep_obj);
12667 Py_DECREF(str_obj);
12668 if (kind1 != kind && buf1)
12669 PyMem_Free(buf1);
12670 if (kind2 != kind && buf2)
12671 PyMem_Free(buf2);
12672 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012673}
12674
12675
12676PyObject *
12677PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12678{
12679 PyObject* str_obj;
12680 PyObject* sep_obj;
12681 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 int kind1, kind2, kind;
12683 void *buf1 = NULL, *buf2 = NULL;
12684 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012685
12686 str_obj = PyUnicode_FromObject(str_in);
12687 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012689 sep_obj = PyUnicode_FromObject(sep_in);
12690 if (!sep_obj) {
12691 Py_DECREF(str_obj);
12692 return NULL;
12693 }
12694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 kind1 = PyUnicode_KIND(str_in);
12696 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012697 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 buf1 = PyUnicode_DATA(str_in);
12699 if (kind1 != kind)
12700 buf1 = _PyUnicode_AsKind(str_in, kind);
12701 if (!buf1)
12702 goto onError;
12703 buf2 = PyUnicode_DATA(sep_obj);
12704 if (kind2 != kind)
12705 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12706 if (!buf2)
12707 goto onError;
12708 len1 = PyUnicode_GET_LENGTH(str_obj);
12709 len2 = PyUnicode_GET_LENGTH(sep_obj);
12710
Benjamin Petersonead6b532011-12-20 17:23:42 -060012711 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012713 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12714 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12715 else
12716 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 break;
12718 case PyUnicode_2BYTE_KIND:
12719 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12720 break;
12721 case PyUnicode_4BYTE_KIND:
12722 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12723 break;
12724 default:
12725 assert(0);
12726 out = 0;
12727 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012728
12729 Py_DECREF(sep_obj);
12730 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 if (kind1 != kind)
12732 PyMem_Free(buf1);
12733 if (kind2 != kind)
12734 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735
12736 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 onError:
12738 Py_DECREF(sep_obj);
12739 Py_DECREF(str_obj);
12740 if (kind1 != kind && buf1)
12741 PyMem_Free(buf1);
12742 if (kind2 != kind && buf2)
12743 PyMem_Free(buf2);
12744 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745}
12746
12747PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012749\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012750Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012751the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012752found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012753
12754static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012755unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756{
Victor Stinner9310abb2011-10-05 00:59:23 +020012757 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012758}
12759
12760PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012761 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012762\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012763Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012764the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012765separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012766
12767static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012768unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012769{
Victor Stinner9310abb2011-10-05 00:59:23 +020012770 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771}
12772
Alexander Belopolsky40018472011-02-26 01:02:56 +000012773PyObject *
12774PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012775{
12776 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012777
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012778 s = PyUnicode_FromObject(s);
12779 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 if (sep != NULL) {
12782 sep = PyUnicode_FromObject(sep);
12783 if (sep == NULL) {
12784 Py_DECREF(s);
12785 return NULL;
12786 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012787 }
12788
Victor Stinner9310abb2011-10-05 00:59:23 +020012789 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012790
12791 Py_DECREF(s);
12792 Py_XDECREF(sep);
12793 return result;
12794}
12795
12796PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012797 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012798\n\
12799Return a list of the words in S, using sep as the\n\
12800delimiter string, starting at the end of the string and\n\
12801working to the front. If maxsplit is given, at most maxsplit\n\
12802splits are done. If sep is not specified, any whitespace string\n\
12803is a separator.");
12804
12805static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012806unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012807{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012808 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012809 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012810 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012811
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012812 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12813 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012814 return NULL;
12815
12816 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012818 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012819 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012820 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012821 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012822}
12823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012824PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826\n\
12827Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012828Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012829is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830
12831static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012832unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012834 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012835 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012837 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12838 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839 return NULL;
12840
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012841 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842}
12843
12844static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012845PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012847 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848}
12849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012850PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852\n\
12853Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012854and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855
12856static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012857unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012859 if (PyUnicode_READY(self) == -1)
12860 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012861 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
Georg Brandlceee0772007-11-27 23:48:05 +000012864PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012866\n\
12867Return a translation table usable for str.translate().\n\
12868If there is only one argument, it must be a dictionary mapping Unicode\n\
12869ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012870Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012871If there are two arguments, they must be strings of equal length, and\n\
12872in the resulting dictionary, each character in x will be mapped to the\n\
12873character at the same position in y. If there is a third argument, it\n\
12874must be a string, whose characters will be mapped to None in the result.");
12875
12876static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012877unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012878{
12879 PyObject *x, *y = NULL, *z = NULL;
12880 PyObject *new = NULL, *key, *value;
12881 Py_ssize_t i = 0;
12882 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883
Georg Brandlceee0772007-11-27 23:48:05 +000012884 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12885 return NULL;
12886 new = PyDict_New();
12887 if (!new)
12888 return NULL;
12889 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 int x_kind, y_kind, z_kind;
12891 void *x_data, *y_data, *z_data;
12892
Georg Brandlceee0772007-11-27 23:48:05 +000012893 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012894 if (!PyUnicode_Check(x)) {
12895 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12896 "be a string if there is a second argument");
12897 goto err;
12898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012900 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12901 "arguments must have equal length");
12902 goto err;
12903 }
12904 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 x_kind = PyUnicode_KIND(x);
12906 y_kind = PyUnicode_KIND(y);
12907 x_data = PyUnicode_DATA(x);
12908 y_data = PyUnicode_DATA(y);
12909 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12910 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012911 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012912 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012913 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012914 if (!value) {
12915 Py_DECREF(key);
12916 goto err;
12917 }
Georg Brandlceee0772007-11-27 23:48:05 +000012918 res = PyDict_SetItem(new, key, value);
12919 Py_DECREF(key);
12920 Py_DECREF(value);
12921 if (res < 0)
12922 goto err;
12923 }
12924 /* create entries for deleting chars in z */
12925 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 z_kind = PyUnicode_KIND(z);
12927 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012928 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012930 if (!key)
12931 goto err;
12932 res = PyDict_SetItem(new, key, Py_None);
12933 Py_DECREF(key);
12934 if (res < 0)
12935 goto err;
12936 }
12937 }
12938 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 int kind;
12940 void *data;
12941
Georg Brandlceee0772007-11-27 23:48:05 +000012942 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012943 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012944 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12945 "to maketrans it must be a dict");
12946 goto err;
12947 }
12948 /* copy entries into the new dict, converting string keys to int keys */
12949 while (PyDict_Next(x, &i, &key, &value)) {
12950 if (PyUnicode_Check(key)) {
12951 /* convert string keys to integer keys */
12952 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012953 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012954 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12955 "table must be of length 1");
12956 goto err;
12957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 kind = PyUnicode_KIND(key);
12959 data = PyUnicode_DATA(key);
12960 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012961 if (!newkey)
12962 goto err;
12963 res = PyDict_SetItem(new, newkey, value);
12964 Py_DECREF(newkey);
12965 if (res < 0)
12966 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012967 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012968 /* just keep integer keys */
12969 if (PyDict_SetItem(new, key, value) < 0)
12970 goto err;
12971 } else {
12972 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12973 "be strings or integers");
12974 goto err;
12975 }
12976 }
12977 }
12978 return new;
12979 err:
12980 Py_DECREF(new);
12981 return NULL;
12982}
12983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012984PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986\n\
12987Return a copy of the string S, where all characters have been mapped\n\
12988through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012989Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012990Unmapped characters are left untouched. Characters mapped to None\n\
12991are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992
12993static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997}
12998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012999PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013002Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003
13004static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013005unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013007 if (PyUnicode_READY(self) == -1)
13008 return NULL;
13009 if (PyUnicode_IS_ASCII(self))
13010 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013011 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012}
13013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013014PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013015 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013017Pad a numeric string S with zeros on the left, to fill a field\n\
13018of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019
13020static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013021unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013023 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013024 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013025 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 int kind;
13027 void *data;
13028 Py_UCS4 chr;
13029
Martin v. Löwis18e16552006-02-15 17:27:45 +000013030 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031 return NULL;
13032
Benjamin Petersonbac79492012-01-14 13:34:47 -050013033 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035
Victor Stinnerc4b49542011-12-11 22:44:26 +010013036 if (PyUnicode_GET_LENGTH(self) >= width)
13037 return unicode_result_unchanged(self);
13038
13039 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040
13041 u = pad(self, fill, 0, '0');
13042
Walter Dörwald068325e2002-04-15 13:36:47 +000013043 if (u == NULL)
13044 return NULL;
13045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 kind = PyUnicode_KIND(u);
13047 data = PyUnicode_DATA(u);
13048 chr = PyUnicode_READ(kind, data, fill);
13049
13050 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013052 PyUnicode_WRITE(kind, data, 0, chr);
13053 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054 }
13055
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013056 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013057 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013068PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013071Return True if S starts with the specified prefix, False otherwise.\n\
13072With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013073With optional end, stop comparing S at that position.\n\
13074prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
13076static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013077unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013080 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013081 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013082 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013083 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013084 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
Jesus Ceaac451502011-04-20 17:09:23 +020013086 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013087 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013088 if (PyTuple_Check(subobj)) {
13089 Py_ssize_t i;
13090 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013091 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013092 if (substring == NULL)
13093 return NULL;
13094 result = tailmatch(self, substring, start, end, -1);
13095 Py_DECREF(substring);
13096 if (result) {
13097 Py_RETURN_TRUE;
13098 }
13099 }
13100 /* nothing matched */
13101 Py_RETURN_FALSE;
13102 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013103 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013104 if (substring == NULL) {
13105 if (PyErr_ExceptionMatches(PyExc_TypeError))
13106 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13107 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013109 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013110 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013112 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113}
13114
13115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013116PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013119Return True if S ends with the specified suffix, False otherwise.\n\
13120With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013121With optional end, stop comparing S at that position.\n\
13122suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123
13124static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013125unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013128 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013129 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013130 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013131 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013132 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
Jesus Ceaac451502011-04-20 17:09:23 +020013134 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013136 if (PyTuple_Check(subobj)) {
13137 Py_ssize_t i;
13138 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013139 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013141 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013143 result = tailmatch(self, substring, start, end, +1);
13144 Py_DECREF(substring);
13145 if (result) {
13146 Py_RETURN_TRUE;
13147 }
13148 }
13149 Py_RETURN_FALSE;
13150 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013151 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013152 if (substring == NULL) {
13153 if (PyErr_ExceptionMatches(PyExc_TypeError))
13154 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13155 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013157 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013160 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161}
13162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013164
13165PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013167\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013168Return a formatted version of S, using substitutions from args and kwargs.\n\
13169The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013170
Eric Smith27bbca62010-11-04 17:06:58 +000013171PyDoc_STRVAR(format_map__doc__,
13172 "S.format_map(mapping) -> str\n\
13173\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013174Return a formatted version of S, using substitutions from mapping.\n\
13175The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013176
Eric Smith4a7d76d2008-05-30 18:10:19 +000013177static PyObject *
13178unicode__format__(PyObject* self, PyObject* args)
13179{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013180 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013181
13182 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13183 return NULL;
13184
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013185 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013187 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013188}
13189
Eric Smith8c663262007-08-25 02:26:07 +000013190PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013192\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013193Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013194
13195static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013196unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013197{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 Py_ssize_t size;
13199
13200 /* If it's a compact object, account for base structure +
13201 character data. */
13202 if (PyUnicode_IS_COMPACT_ASCII(v))
13203 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13204 else if (PyUnicode_IS_COMPACT(v))
13205 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013206 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 else {
13208 /* If it is a two-block object, account for base object, and
13209 for character block if present. */
13210 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013211 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013213 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 }
13215 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013216 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013217 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013219 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013220 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221
13222 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013223}
13224
13225PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013226 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013227
13228static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013229unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013230{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013231 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 if (!copy)
13233 return NULL;
13234 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013235}
13236
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013238 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013239 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013240 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13241 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013242 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13243 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013244 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013245 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13246 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13247 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13248 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13249 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013250 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013251 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13252 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13253 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013254 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013255 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13256 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13257 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013258 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013259 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013260 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013261 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013262 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13263 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13264 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13265 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13266 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13267 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13268 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13269 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13270 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13271 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13272 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13273 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13274 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13275 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013276 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013277 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013278 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013279 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013280 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013281 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013282 {"maketrans", (PyCFunction) unicode_maketrans,
13283 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013284 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013285#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013286 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013287 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288#endif
13289
Benjamin Peterson14339b62009-01-31 16:36:08 +000013290 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291 {NULL, NULL}
13292};
13293
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013294static PyObject *
13295unicode_mod(PyObject *v, PyObject *w)
13296{
Brian Curtindfc80e32011-08-10 20:28:54 -050013297 if (!PyUnicode_Check(v))
13298 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013300}
13301
13302static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013303 0, /*nb_add*/
13304 0, /*nb_subtract*/
13305 0, /*nb_multiply*/
13306 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013307};
13308
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013310 (lenfunc) unicode_length, /* sq_length */
13311 PyUnicode_Concat, /* sq_concat */
13312 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13313 (ssizeargfunc) unicode_getitem, /* sq_item */
13314 0, /* sq_slice */
13315 0, /* sq_ass_item */
13316 0, /* sq_ass_slice */
13317 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318};
13319
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013320static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013321unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 if (PyUnicode_READY(self) == -1)
13324 return NULL;
13325
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013326 if (PyIndex_Check(item)) {
13327 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013328 if (i == -1 && PyErr_Occurred())
13329 return NULL;
13330 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013332 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013333 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013334 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013335 PyObject *result;
13336 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013337 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013338 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013340 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013342 return NULL;
13343 }
13344
13345 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013346 Py_INCREF(unicode_empty);
13347 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013349 slicelength == PyUnicode_GET_LENGTH(self)) {
13350 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013351 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013352 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013353 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013354 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013355 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013356 src_kind = PyUnicode_KIND(self);
13357 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013358 if (!PyUnicode_IS_ASCII(self)) {
13359 kind_limit = kind_maxchar_limit(src_kind);
13360 max_char = 0;
13361 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13362 ch = PyUnicode_READ(src_kind, src_data, cur);
13363 if (ch > max_char) {
13364 max_char = ch;
13365 if (max_char >= kind_limit)
13366 break;
13367 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013368 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013369 }
Victor Stinner55c99112011-10-13 01:17:06 +020013370 else
13371 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013372 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013373 if (result == NULL)
13374 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013375 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013376 dest_data = PyUnicode_DATA(result);
13377
13378 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013379 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13380 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013381 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013382 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013383 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013384 } else {
13385 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13386 return NULL;
13387 }
13388}
13389
13390static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013391 (lenfunc)unicode_length, /* mp_length */
13392 (binaryfunc)unicode_subscript, /* mp_subscript */
13393 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013394};
13395
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397/* Helpers for PyUnicode_Format() */
13398
13399static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013400getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013402 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 (*p_argidx)++;
13405 if (arglen < 0)
13406 return args;
13407 else
13408 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409 }
13410 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013412 return NULL;
13413}
13414
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013415/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013417static PyObject *
13418formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013420 char *p;
13421 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013423
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424 x = PyFloat_AsDouble(v);
13425 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013426 return NULL;
13427
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013430
Eric Smith0923d1d2009-04-16 20:16:10 +000013431 p = PyOS_double_to_string(x, type, prec,
13432 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013433 if (p == NULL)
13434 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013435 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013436 PyMem_Free(p);
13437 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438}
13439
Victor Stinnerd0880d52012-04-27 23:40:13 +020013440/* formatlong() emulates the format codes d, u, o, x and X, and
13441 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13442 * Python's regular ints.
13443 * Return value: a new PyUnicodeObject*, or NULL if error.
13444 * The output string is of the form
13445 * "-"? ("0x" | "0X")? digit+
13446 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13447 * set in flags. The case of hex digits will be correct,
13448 * There will be at least prec digits, zero-filled on the left if
13449 * necessary to get that many.
13450 * val object to be converted
13451 * flags bitmask of format flags; only F_ALT is looked at
13452 * prec minimum number of digits; 0-fill on left if needed
13453 * type a character in [duoxX]; u acts the same as d
13454 *
13455 * CAUTION: o, x and X conversions on regular ints can never
13456 * produce a '-' sign, but can for Python's unbounded ints.
13457 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013458static PyObject*
13459formatlong(PyObject *val, int flags, int prec, int type)
13460{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013461 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013462 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013463 Py_ssize_t i;
13464 int sign; /* 1 if '-', else 0 */
13465 int len; /* number of characters */
13466 Py_ssize_t llen;
13467 int numdigits; /* len == numnondigits + numdigits */
13468 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013469
Victor Stinnerd0880d52012-04-27 23:40:13 +020013470 /* Avoid exceeding SSIZE_T_MAX */
13471 if (prec > INT_MAX-3) {
13472 PyErr_SetString(PyExc_OverflowError,
13473 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013474 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013475 }
13476
13477 assert(PyLong_Check(val));
13478
13479 switch (type) {
13480 case 'd':
13481 case 'u':
13482 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013483 if (PyBool_Check(val))
13484 result = PyNumber_ToBase(val, 10);
13485 else
13486 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013487 break;
13488 case 'o':
13489 numnondigits = 2;
13490 result = PyNumber_ToBase(val, 8);
13491 break;
13492 case 'x':
13493 case 'X':
13494 numnondigits = 2;
13495 result = PyNumber_ToBase(val, 16);
13496 break;
13497 default:
13498 assert(!"'type' not in [duoxX]");
13499 }
13500 if (!result)
13501 return NULL;
13502
13503 assert(unicode_modifiable(result));
13504 assert(PyUnicode_IS_READY(result));
13505 assert(PyUnicode_IS_ASCII(result));
13506
13507 /* To modify the string in-place, there can only be one reference. */
13508 if (Py_REFCNT(result) != 1) {
13509 PyErr_BadInternalCall();
13510 return NULL;
13511 }
13512 buf = PyUnicode_DATA(result);
13513 llen = PyUnicode_GET_LENGTH(result);
13514 if (llen > INT_MAX) {
13515 PyErr_SetString(PyExc_ValueError,
13516 "string too large in _PyBytes_FormatLong");
13517 return NULL;
13518 }
13519 len = (int)llen;
13520 sign = buf[0] == '-';
13521 numnondigits += sign;
13522 numdigits = len - numnondigits;
13523 assert(numdigits > 0);
13524
13525 /* Get rid of base marker unless F_ALT */
13526 if (((flags & F_ALT) == 0 &&
13527 (type == 'o' || type == 'x' || type == 'X'))) {
13528 assert(buf[sign] == '0');
13529 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13530 buf[sign+1] == 'o');
13531 numnondigits -= 2;
13532 buf += 2;
13533 len -= 2;
13534 if (sign)
13535 buf[0] = '-';
13536 assert(len == numnondigits + numdigits);
13537 assert(numdigits > 0);
13538 }
13539
13540 /* Fill with leading zeroes to meet minimum width. */
13541 if (prec > numdigits) {
13542 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13543 numnondigits + prec);
13544 char *b1;
13545 if (!r1) {
13546 Py_DECREF(result);
13547 return NULL;
13548 }
13549 b1 = PyBytes_AS_STRING(r1);
13550 for (i = 0; i < numnondigits; ++i)
13551 *b1++ = *buf++;
13552 for (i = 0; i < prec - numdigits; i++)
13553 *b1++ = '0';
13554 for (i = 0; i < numdigits; i++)
13555 *b1++ = *buf++;
13556 *b1 = '\0';
13557 Py_DECREF(result);
13558 result = r1;
13559 buf = PyBytes_AS_STRING(result);
13560 len = numnondigits + prec;
13561 }
13562
13563 /* Fix up case for hex conversions. */
13564 if (type == 'X') {
13565 /* Need to convert all lower case letters to upper case.
13566 and need to convert 0x to 0X (and -0x to -0X). */
13567 for (i = 0; i < len; i++)
13568 if (buf[i] >= 'a' && buf[i] <= 'x')
13569 buf[i] -= 'a'-'A';
13570 }
13571 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13572 PyObject *unicode;
13573 unicode = unicode_fromascii((unsigned char *)buf, len);
13574 Py_DECREF(result);
13575 result = unicode;
13576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013577 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013578}
13579
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580static Py_UCS4
13581formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013583 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013584 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013585 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013586 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 goto onError;
13589 }
13590 else {
13591 /* Integer input truncated to a character */
13592 long x;
13593 x = PyLong_AsLong(v);
13594 if (x == -1 && PyErr_Occurred())
13595 goto onError;
13596
Victor Stinner8faf8212011-12-08 22:14:11 +010013597 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 PyErr_SetString(PyExc_OverflowError,
13599 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013600 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 }
13602
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013603 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013604 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013605
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013607 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013609 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013610}
13611
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013612static int
13613repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13614{
13615 int r;
13616 assert(count > 0);
13617 assert(PyUnicode_Check(obj));
13618 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013619 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013620 if (repeated == NULL)
13621 return -1;
13622 r = _PyAccu_Accumulate(acc, repeated);
13623 Py_DECREF(repeated);
13624 return r;
13625 }
13626 else {
13627 do {
13628 if (_PyAccu_Accumulate(acc, obj))
13629 return -1;
13630 } while (--count);
13631 return 0;
13632 }
13633}
13634
Alexander Belopolsky40018472011-02-26 01:02:56 +000013635PyObject *
13636PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013638 void *fmt;
13639 int fmtkind;
13640 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 int r;
13643 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013644 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013645 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 PyObject *temp = NULL;
13647 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013648 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013649 _PyAccu acc;
13650 static PyObject *plus, *minus, *blank, *zero, *percent;
13651
13652 if (!plus && !(plus = get_latin1_char('+')))
13653 return NULL;
13654 if (!minus && !(minus = get_latin1_char('-')))
13655 return NULL;
13656 if (!blank && !(blank = get_latin1_char(' ')))
13657 return NULL;
13658 if (!zero && !(zero = get_latin1_char('0')))
13659 return NULL;
13660 if (!percent && !(percent = get_latin1_char('%')))
13661 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013662
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 PyErr_BadInternalCall();
13665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013666 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013667 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013668 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013670 if (PyUnicode_READY(uformat) == -1)
13671 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013672 if (_PyAccu_Init(&acc))
13673 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 fmt = PyUnicode_DATA(uformat);
13675 fmtkind = PyUnicode_KIND(uformat);
13676 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13677 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013678
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 arglen = PyTuple_Size(args);
13681 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682 }
13683 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 arglen = -1;
13685 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013686 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013687 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013688 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690
13691 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013692 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013693 PyObject *nonfmt;
13694 Py_ssize_t nonfmtpos;
13695 nonfmtpos = fmtpos++;
13696 while (fmtcnt >= 0 &&
13697 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13698 fmtpos++;
13699 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013700 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013701 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013702 if (nonfmt == NULL)
13703 goto onError;
13704 r = _PyAccu_Accumulate(&acc, nonfmt);
13705 Py_DECREF(nonfmt);
13706 if (r)
13707 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013708 }
13709 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 /* Got a format specifier */
13711 int flags = 0;
13712 Py_ssize_t width = -1;
13713 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013714 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013715 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 int isnumok;
13717 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013718 void *pbuf = NULL;
13719 Py_ssize_t pindex, len;
13720 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013722 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013723 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13724 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013725 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 Py_ssize_t keylen;
13727 PyObject *key;
13728 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013729
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 if (dict == NULL) {
13731 PyErr_SetString(PyExc_TypeError,
13732 "format requires a mapping");
13733 goto onError;
13734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013736 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013737 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 /* Skip over balanced parentheses */
13739 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013740 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13741 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013743 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013744 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013747 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 if (fmtcnt < 0 || pcount > 0) {
13749 PyErr_SetString(PyExc_ValueError,
13750 "incomplete format key");
13751 goto onError;
13752 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013753 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013754 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 if (key == NULL)
13756 goto onError;
13757 if (args_owned) {
13758 Py_DECREF(args);
13759 args_owned = 0;
13760 }
13761 args = PyObject_GetItem(dict, key);
13762 Py_DECREF(key);
13763 if (args == NULL) {
13764 goto onError;
13765 }
13766 args_owned = 1;
13767 arglen = -1;
13768 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013769 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013771 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13772 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 case '-': flags |= F_LJUST; continue;
13774 case '+': flags |= F_SIGN; continue;
13775 case ' ': flags |= F_BLANK; continue;
13776 case '#': flags |= F_ALT; continue;
13777 case '0': flags |= F_ZERO; continue;
13778 }
13779 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013780 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013781 if (c == '*') {
13782 v = getnextarg(args, arglen, &argidx);
13783 if (v == NULL)
13784 goto onError;
13785 if (!PyLong_Check(v)) {
13786 PyErr_SetString(PyExc_TypeError,
13787 "* wants int");
13788 goto onError;
13789 }
13790 width = PyLong_AsLong(v);
13791 if (width == -1 && PyErr_Occurred())
13792 goto onError;
13793 if (width < 0) {
13794 flags |= F_LJUST;
13795 width = -width;
13796 }
13797 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 }
13800 else if (c >= '0' && c <= '9') {
13801 width = c - '0';
13802 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013803 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013804 if (c < '0' || c > '9')
13805 break;
13806 if ((width*10) / 10 != width) {
13807 PyErr_SetString(PyExc_ValueError,
13808 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013809 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013810 }
13811 width = width*10 + (c - '0');
13812 }
13813 }
13814 if (c == '.') {
13815 prec = 0;
13816 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013817 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 if (c == '*') {
13819 v = getnextarg(args, arglen, &argidx);
13820 if (v == NULL)
13821 goto onError;
13822 if (!PyLong_Check(v)) {
13823 PyErr_SetString(PyExc_TypeError,
13824 "* wants int");
13825 goto onError;
13826 }
13827 prec = PyLong_AsLong(v);
13828 if (prec == -1 && PyErr_Occurred())
13829 goto onError;
13830 if (prec < 0)
13831 prec = 0;
13832 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013834 }
13835 else if (c >= '0' && c <= '9') {
13836 prec = c - '0';
13837 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013838 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 if (c < '0' || c > '9')
13840 break;
13841 if ((prec*10) / 10 != prec) {
13842 PyErr_SetString(PyExc_ValueError,
13843 "prec too big");
13844 goto onError;
13845 }
13846 prec = prec*10 + (c - '0');
13847 }
13848 }
13849 } /* prec */
13850 if (fmtcnt >= 0) {
13851 if (c == 'h' || c == 'l' || c == 'L') {
13852 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013853 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013854 }
13855 }
13856 if (fmtcnt < 0) {
13857 PyErr_SetString(PyExc_ValueError,
13858 "incomplete format");
13859 goto onError;
13860 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013861
13862 if (c == '%') {
13863 _PyAccu_Accumulate(&acc, percent);
13864 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013865 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013866
13867
13868 v = getnextarg(args, arglen, &argidx);
13869 if (v == NULL)
13870 goto onError;
13871
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 sign = 0;
13873 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013874 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013875 switch (c) {
13876
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 case 's':
13878 case 'r':
13879 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013880 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013881 temp = v;
13882 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013883 }
13884 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 if (c == 's')
13886 temp = PyObject_Str(v);
13887 else if (c == 'r')
13888 temp = PyObject_Repr(v);
13889 else
13890 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013891 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013892 break;
13893
13894 case 'i':
13895 case 'd':
13896 case 'u':
13897 case 'o':
13898 case 'x':
13899 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 isnumok = 0;
13901 if (PyNumber_Check(v)) {
13902 PyObject *iobj=NULL;
13903
13904 if (PyLong_Check(v)) {
13905 iobj = v;
13906 Py_INCREF(iobj);
13907 }
13908 else {
13909 iobj = PyNumber_Long(v);
13910 }
13911 if (iobj!=NULL) {
13912 if (PyLong_Check(iobj)) {
13913 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013914 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013915 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013916 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013917 }
13918 else {
13919 Py_DECREF(iobj);
13920 }
13921 }
13922 }
13923 if (!isnumok) {
13924 PyErr_Format(PyExc_TypeError,
13925 "%%%c format: a number is required, "
13926 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13927 goto onError;
13928 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013929 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013930 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013931 fillobj = zero;
13932 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013933 break;
13934
13935 case 'e':
13936 case 'E':
13937 case 'f':
13938 case 'F':
13939 case 'g':
13940 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000013941 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013942 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013943 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013944 fillobj = zero;
13945 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013946 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013947 break;
13948
13949 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 {
13951 Py_UCS4 ch = formatchar(v);
13952 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013953 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013954 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013955 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013956 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013957
13958 default:
13959 PyErr_Format(PyExc_ValueError,
13960 "unsupported format character '%c' (0x%x) "
13961 "at index %zd",
13962 (31<=c && c<=126) ? (char)c : '?',
13963 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013964 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013965 goto onError;
13966 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013967 if (temp == NULL)
13968 goto onError;
13969 assert (PyUnicode_Check(temp));
13970 if (PyUnicode_READY(temp) == -1) {
13971 Py_CLEAR(temp);
13972 goto onError;
13973 }
13974 kind = PyUnicode_KIND(temp);
13975 pbuf = PyUnicode_DATA(temp);
13976 len = PyUnicode_GET_LENGTH(temp);
13977
13978 if (c == 's' || c == 'r' || c == 'a') {
13979 if (prec >= 0 && len > prec)
13980 len = prec;
13981 }
13982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013983 /* pbuf is initialized here. */
13984 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013986 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13987 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013989 pindex++;
13990 }
13991 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13992 signobj = plus;
13993 len--;
13994 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013995 }
13996 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013997 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013998 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013999 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 else
14001 sign = 0;
14002 }
14003 if (width < len)
14004 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014005 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014006 if (fill != ' ') {
14007 assert(signobj != NULL);
14008 if (_PyAccu_Accumulate(&acc, signobj))
14009 goto onError;
14010 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 if (width > len)
14012 width--;
14013 }
14014 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014015 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014016 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014018 second = get_latin1_char(
14019 PyUnicode_READ(kind, pbuf, pindex + 1));
14020 pindex += 2;
14021 if (second == NULL ||
14022 _PyAccu_Accumulate(&acc, zero) ||
14023 _PyAccu_Accumulate(&acc, second))
14024 goto onError;
14025 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000014026 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014027 width -= 2;
14028 if (width < 0)
14029 width = 0;
14030 len -= 2;
14031 }
14032 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014033 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014034 if (repeat_accumulate(&acc, fillobj, width - len))
14035 goto onError;
14036 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014037 }
14038 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014039 if (sign) {
14040 assert(signobj != NULL);
14041 if (_PyAccu_Accumulate(&acc, signobj))
14042 goto onError;
14043 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014044 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014045 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14046 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014047 second = get_latin1_char(
14048 PyUnicode_READ(kind, pbuf, pindex + 1));
14049 pindex += 2;
14050 if (second == NULL ||
14051 _PyAccu_Accumulate(&acc, zero) ||
14052 _PyAccu_Accumulate(&acc, second))
14053 goto onError;
14054 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 }
14056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014057 /* Copy all characters, preserving len */
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014058 if (pindex == 0 && len == PyUnicode_GET_LENGTH(temp)) {
14059 r = _PyAccu_Accumulate(&acc, temp);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014060 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014061 else {
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014062 v = PyUnicode_Substring(temp, pindex, pindex + len);
14063 if (v == NULL)
14064 goto onError;
14065 r = _PyAccu_Accumulate(&acc, v);
14066 Py_DECREF(v);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014067 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014068 if (r)
14069 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014070 if (width > len && repeat_accumulate(&acc, blank, width - len))
14071 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 if (dict && (argidx < arglen) && c != '%') {
14073 PyErr_SetString(PyExc_TypeError,
14074 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014075 goto onError;
14076 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014077 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014078 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014079 } /* until end */
14080 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014081 PyErr_SetString(PyExc_TypeError,
14082 "not all arguments converted during string formatting");
14083 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084 }
14085
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014086 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014087 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014088 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014089 }
14090 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014091 Py_XDECREF(temp);
14092 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014093 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094
Benjamin Peterson29060642009-01-31 22:14:21 +000014095 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014096 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014097 Py_XDECREF(temp);
14098 Py_XDECREF(second);
14099 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014101 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014102 }
14103 return NULL;
14104}
14105
Jeremy Hylton938ace62002-07-17 16:30:39 +000014106static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014107unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14108
Tim Peters6d6c1a32001-08-02 04:15:00 +000014109static PyObject *
14110unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14111{
Benjamin Peterson29060642009-01-31 22:14:21 +000014112 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 static char *kwlist[] = {"object", "encoding", "errors", 0};
14114 char *encoding = NULL;
14115 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014116
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 if (type != &PyUnicode_Type)
14118 return unicode_subtype_new(type, args, kwds);
14119 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014120 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014122 if (x == NULL) {
14123 Py_INCREF(unicode_empty);
14124 return unicode_empty;
14125 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 if (encoding == NULL && errors == NULL)
14127 return PyObject_Str(x);
14128 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014129 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014130}
14131
Guido van Rossume023fe02001-08-30 03:12:59 +000014132static PyObject *
14133unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14134{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014135 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014136 Py_ssize_t length, char_size;
14137 int share_wstr, share_utf8;
14138 unsigned int kind;
14139 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014140
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014142
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014143 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014144 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014146 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014147 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014148 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014149 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014150 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014151
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014152 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014153 if (self == NULL) {
14154 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 return NULL;
14156 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014157 kind = PyUnicode_KIND(unicode);
14158 length = PyUnicode_GET_LENGTH(unicode);
14159
14160 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014161#ifdef Py_DEBUG
14162 _PyUnicode_HASH(self) = -1;
14163#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014164 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014165#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014166 _PyUnicode_STATE(self).interned = 0;
14167 _PyUnicode_STATE(self).kind = kind;
14168 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014169 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014170 _PyUnicode_STATE(self).ready = 1;
14171 _PyUnicode_WSTR(self) = NULL;
14172 _PyUnicode_UTF8_LENGTH(self) = 0;
14173 _PyUnicode_UTF8(self) = NULL;
14174 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014175 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014176
14177 share_utf8 = 0;
14178 share_wstr = 0;
14179 if (kind == PyUnicode_1BYTE_KIND) {
14180 char_size = 1;
14181 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14182 share_utf8 = 1;
14183 }
14184 else if (kind == PyUnicode_2BYTE_KIND) {
14185 char_size = 2;
14186 if (sizeof(wchar_t) == 2)
14187 share_wstr = 1;
14188 }
14189 else {
14190 assert(kind == PyUnicode_4BYTE_KIND);
14191 char_size = 4;
14192 if (sizeof(wchar_t) == 4)
14193 share_wstr = 1;
14194 }
14195
14196 /* Ensure we won't overflow the length. */
14197 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14198 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014199 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014200 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014201 data = PyObject_MALLOC((length + 1) * char_size);
14202 if (data == NULL) {
14203 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014204 goto onError;
14205 }
14206
Victor Stinnerc3c74152011-10-02 20:39:55 +020014207 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014208 if (share_utf8) {
14209 _PyUnicode_UTF8_LENGTH(self) = length;
14210 _PyUnicode_UTF8(self) = data;
14211 }
14212 if (share_wstr) {
14213 _PyUnicode_WSTR_LENGTH(self) = length;
14214 _PyUnicode_WSTR(self) = (wchar_t *)data;
14215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014216
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014217 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014218 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014219 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014220#ifdef Py_DEBUG
14221 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14222#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014223 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014224 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014225
14226onError:
14227 Py_DECREF(unicode);
14228 Py_DECREF(self);
14229 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014230}
14231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014232PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014233 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014234\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014235Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014236encoding defaults to the current default string encoding.\n\
14237errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014238
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014239static PyObject *unicode_iter(PyObject *seq);
14240
Guido van Rossumd57fd912000-03-10 22:53:23 +000014241PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014242 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 "str", /* tp_name */
14244 sizeof(PyUnicodeObject), /* tp_size */
14245 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014246 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014247 (destructor)unicode_dealloc, /* tp_dealloc */
14248 0, /* tp_print */
14249 0, /* tp_getattr */
14250 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014251 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 unicode_repr, /* tp_repr */
14253 &unicode_as_number, /* tp_as_number */
14254 &unicode_as_sequence, /* tp_as_sequence */
14255 &unicode_as_mapping, /* tp_as_mapping */
14256 (hashfunc) unicode_hash, /* tp_hash*/
14257 0, /* tp_call*/
14258 (reprfunc) unicode_str, /* tp_str */
14259 PyObject_GenericGetAttr, /* tp_getattro */
14260 0, /* tp_setattro */
14261 0, /* tp_as_buffer */
14262 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014263 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014264 unicode_doc, /* tp_doc */
14265 0, /* tp_traverse */
14266 0, /* tp_clear */
14267 PyUnicode_RichCompare, /* tp_richcompare */
14268 0, /* tp_weaklistoffset */
14269 unicode_iter, /* tp_iter */
14270 0, /* tp_iternext */
14271 unicode_methods, /* tp_methods */
14272 0, /* tp_members */
14273 0, /* tp_getset */
14274 &PyBaseObject_Type, /* tp_base */
14275 0, /* tp_dict */
14276 0, /* tp_descr_get */
14277 0, /* tp_descr_set */
14278 0, /* tp_dictoffset */
14279 0, /* tp_init */
14280 0, /* tp_alloc */
14281 unicode_new, /* tp_new */
14282 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014283};
14284
14285/* Initialize the Unicode implementation */
14286
Victor Stinner3a50e702011-10-18 21:21:00 +020014287int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014288{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014289 int i;
14290
Thomas Wouters477c8d52006-05-27 19:21:47 +000014291 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014292 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014293 0x000A, /* LINE FEED */
14294 0x000D, /* CARRIAGE RETURN */
14295 0x001C, /* FILE SEPARATOR */
14296 0x001D, /* GROUP SEPARATOR */
14297 0x001E, /* RECORD SEPARATOR */
14298 0x0085, /* NEXT LINE */
14299 0x2028, /* LINE SEPARATOR */
14300 0x2029, /* PARAGRAPH SEPARATOR */
14301 };
14302
Fred Drakee4315f52000-05-09 19:53:39 +000014303 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014304 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014305 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014306 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014307 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014308
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014309 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014310 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014311 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014312 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014313
14314 /* initialize the linebreak bloom filter */
14315 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014316 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014317 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014318
14319 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014320
14321#ifdef HAVE_MBCS
14322 winver.dwOSVersionInfoSize = sizeof(winver);
14323 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14324 PyErr_SetFromWindowsErr(0);
14325 return -1;
14326 }
14327#endif
14328 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014329}
14330
14331/* Finalize the Unicode implementation */
14332
/* Nothing is released here: the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14338
Guido van Rossumd57fd912000-03-10 22:53:23 +000014339void
Thomas Wouters78890102000-07-22 19:25:51 +000014340_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014341{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014342 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014343
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014344 Py_XDECREF(unicode_empty);
14345 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014346
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014347 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014348 if (unicode_latin1[i]) {
14349 Py_DECREF(unicode_latin1[i]);
14350 unicode_latin1[i] = NULL;
14351 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014352 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014353 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014354 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014355}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014356
Walter Dörwald16807132007-05-25 13:52:07 +000014357void
14358PyUnicode_InternInPlace(PyObject **p)
14359{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014360 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014362#ifdef Py_DEBUG
14363 assert(s != NULL);
14364 assert(_PyUnicode_CHECK(s));
14365#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014367 return;
14368#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 /* If it's a subclass, we don't really know what putting
14370 it in the interned dict might do. */
14371 if (!PyUnicode_CheckExact(s))
14372 return;
14373 if (PyUnicode_CHECK_INTERNED(s))
14374 return;
14375 if (interned == NULL) {
14376 interned = PyDict_New();
14377 if (interned == NULL) {
14378 PyErr_Clear(); /* Don't leave an exception */
14379 return;
14380 }
14381 }
14382 /* It might be that the GetItem call fails even
14383 though the key is present in the dictionary,
14384 namely when this happens during a stack overflow. */
14385 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014386 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014387 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014388
Benjamin Peterson29060642009-01-31 22:14:21 +000014389 if (t) {
14390 Py_INCREF(t);
14391 Py_DECREF(*p);
14392 *p = t;
14393 return;
14394 }
Walter Dörwald16807132007-05-25 13:52:07 +000014395
Benjamin Peterson14339b62009-01-31 16:36:08 +000014396 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014397 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014398 PyErr_Clear();
14399 PyThreadState_GET()->recursion_critical = 0;
14400 return;
14401 }
14402 PyThreadState_GET()->recursion_critical = 0;
14403 /* The two references in interned are not counted by refcnt.
14404 The deallocator will take care of this */
14405 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014406 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014407}
14408
14409void
14410PyUnicode_InternImmortal(PyObject **p)
14411{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014412 PyUnicode_InternInPlace(p);
14413 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014414 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014415 Py_INCREF(*p);
14416 }
Walter Dörwald16807132007-05-25 13:52:07 +000014417}
14418
14419PyObject *
14420PyUnicode_InternFromString(const char *cp)
14421{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014422 PyObject *s = PyUnicode_FromString(cp);
14423 if (s == NULL)
14424 return NULL;
14425 PyUnicode_InternInPlace(&s);
14426 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014427}
14428
Alexander Belopolsky40018472011-02-26 01:02:56 +000014429void
14430_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014432 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014433 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014434 Py_ssize_t i, n;
14435 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014436
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 if (interned == NULL || !PyDict_Check(interned))
14438 return;
14439 keys = PyDict_Keys(interned);
14440 if (keys == NULL || !PyList_Check(keys)) {
14441 PyErr_Clear();
14442 return;
14443 }
Walter Dörwald16807132007-05-25 13:52:07 +000014444
Benjamin Peterson14339b62009-01-31 16:36:08 +000014445 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14446 detector, interned unicode strings are not forcibly deallocated;
14447 rather, we give them their stolen references back, and then clear
14448 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014449
Benjamin Peterson14339b62009-01-31 16:36:08 +000014450 n = PyList_GET_SIZE(keys);
14451 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014452 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014453 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014454 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014455 if (PyUnicode_READY(s) == -1) {
14456 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014457 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014459 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 case SSTATE_NOT_INTERNED:
14461 /* XXX Shouldn't happen */
14462 break;
14463 case SSTATE_INTERNED_IMMORTAL:
14464 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014465 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 break;
14467 case SSTATE_INTERNED_MORTAL:
14468 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014469 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014470 break;
14471 default:
14472 Py_FatalError("Inconsistent interned string state.");
14473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014474 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 }
14476 fprintf(stderr, "total size of all interned strings: "
14477 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14478 "mortal/immortal\n", mortal_size, immortal_size);
14479 Py_DECREF(keys);
14480 PyDict_Clear(interned);
14481 Py_DECREF(interned);
14482 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014483}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014484
14485
14486/********************* Unicode Iterator **************************/
14487
14488typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 PyObject_HEAD
14490 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014491 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014492} unicodeiterobject;
14493
14494static void
14495unicodeiter_dealloc(unicodeiterobject *it)
14496{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014497 _PyObject_GC_UNTRACK(it);
14498 Py_XDECREF(it->it_seq);
14499 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014500}
14501
14502static int
14503unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14504{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014505 Py_VISIT(it->it_seq);
14506 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014507}
14508
14509static PyObject *
14510unicodeiter_next(unicodeiterobject *it)
14511{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014512 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014513
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 assert(it != NULL);
14515 seq = it->it_seq;
14516 if (seq == NULL)
14517 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014518 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014520 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14521 int kind = PyUnicode_KIND(seq);
14522 void *data = PyUnicode_DATA(seq);
14523 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14524 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014525 if (item != NULL)
14526 ++it->it_index;
14527 return item;
14528 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014529
Benjamin Peterson14339b62009-01-31 16:36:08 +000014530 Py_DECREF(seq);
14531 it->it_seq = NULL;
14532 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014533}
14534
14535static PyObject *
14536unicodeiter_len(unicodeiterobject *it)
14537{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014538 Py_ssize_t len = 0;
14539 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014540 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014541 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014542}
14543
14544PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14545
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014546static PyObject *
14547unicodeiter_reduce(unicodeiterobject *it)
14548{
14549 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014550 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014551 it->it_seq, it->it_index);
14552 } else {
14553 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14554 if (u == NULL)
14555 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014556 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014557 }
14558}
14559
14560PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14561
14562static PyObject *
14563unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14564{
14565 Py_ssize_t index = PyLong_AsSsize_t(state);
14566 if (index == -1 && PyErr_Occurred())
14567 return NULL;
14568 if (index < 0)
14569 index = 0;
14570 it->it_index = index;
14571 Py_RETURN_NONE;
14572}
14573
14574PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14575
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014576static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014577 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014578 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014579 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14580 reduce_doc},
14581 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14582 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014583 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014584};
14585
14586PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014587 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14588 "str_iterator", /* tp_name */
14589 sizeof(unicodeiterobject), /* tp_basicsize */
14590 0, /* tp_itemsize */
14591 /* methods */
14592 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14593 0, /* tp_print */
14594 0, /* tp_getattr */
14595 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014596 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014597 0, /* tp_repr */
14598 0, /* tp_as_number */
14599 0, /* tp_as_sequence */
14600 0, /* tp_as_mapping */
14601 0, /* tp_hash */
14602 0, /* tp_call */
14603 0, /* tp_str */
14604 PyObject_GenericGetAttr, /* tp_getattro */
14605 0, /* tp_setattro */
14606 0, /* tp_as_buffer */
14607 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14608 0, /* tp_doc */
14609 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14610 0, /* tp_clear */
14611 0, /* tp_richcompare */
14612 0, /* tp_weaklistoffset */
14613 PyObject_SelfIter, /* tp_iter */
14614 (iternextfunc)unicodeiter_next, /* tp_iternext */
14615 unicodeiter_methods, /* tp_methods */
14616 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014617};
14618
14619static PyObject *
14620unicode_iter(PyObject *seq)
14621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014622 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014623
Benjamin Peterson14339b62009-01-31 16:36:08 +000014624 if (!PyUnicode_Check(seq)) {
14625 PyErr_BadInternalCall();
14626 return NULL;
14627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014628 if (PyUnicode_READY(seq) == -1)
14629 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014630 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14631 if (it == NULL)
14632 return NULL;
14633 it->it_index = 0;
14634 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014635 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014636 _PyObject_GC_TRACK(it);
14637 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014638}
14639
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014640
14641size_t
14642Py_UNICODE_strlen(const Py_UNICODE *u)
14643{
14644 int res = 0;
14645 while(*u++)
14646 res++;
14647 return res;
14648}
14649
14650Py_UNICODE*
14651Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14652{
14653 Py_UNICODE *u = s1;
14654 while ((*u++ = *s2++));
14655 return s1;
14656}
14657
14658Py_UNICODE*
14659Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14660{
14661 Py_UNICODE *u = s1;
14662 while ((*u++ = *s2++))
14663 if (n-- == 0)
14664 break;
14665 return s1;
14666}
14667
14668Py_UNICODE*
14669Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14670{
14671 Py_UNICODE *u1 = s1;
14672 u1 += Py_UNICODE_strlen(u1);
14673 Py_UNICODE_strcpy(u1, s2);
14674 return s1;
14675}
14676
14677int
14678Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14679{
14680 while (*s1 && *s2 && *s1 == *s2)
14681 s1++, s2++;
14682 if (*s1 && *s2)
14683 return (*s1 < *s2) ? -1 : +1;
14684 if (*s1)
14685 return 1;
14686 if (*s2)
14687 return -1;
14688 return 0;
14689}
14690
14691int
14692Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14693{
14694 register Py_UNICODE u1, u2;
14695 for (; n != 0; n--) {
14696 u1 = *s1;
14697 u2 = *s2;
14698 if (u1 != u2)
14699 return (u1 < u2) ? -1 : +1;
14700 if (u1 == '\0')
14701 return 0;
14702 s1++;
14703 s2++;
14704 }
14705 return 0;
14706}
14707
14708Py_UNICODE*
14709Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14710{
14711 const Py_UNICODE *p;
14712 for (p = s; *p; p++)
14713 if (*p == c)
14714 return (Py_UNICODE*)p;
14715 return NULL;
14716}
14717
14718Py_UNICODE*
14719Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14720{
14721 const Py_UNICODE *p;
14722 p = s + Py_UNICODE_strlen(s);
14723 while (p != s) {
14724 p--;
14725 if (*p == c)
14726 return (Py_UNICODE*)p;
14727 }
14728 return NULL;
14729}
Victor Stinner331ea922010-08-10 16:37:20 +000014730
Victor Stinner71133ff2010-09-01 23:43:53 +000014731Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014732PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014733{
Victor Stinner577db2c2011-10-11 22:12:48 +020014734 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014735 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014737 if (!PyUnicode_Check(unicode)) {
14738 PyErr_BadArgument();
14739 return NULL;
14740 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014741 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014742 if (u == NULL)
14743 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014744 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014745 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014746 PyErr_NoMemory();
14747 return NULL;
14748 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014749 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014750 size *= sizeof(Py_UNICODE);
14751 copy = PyMem_Malloc(size);
14752 if (copy == NULL) {
14753 PyErr_NoMemory();
14754 return NULL;
14755 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014756 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014757 return copy;
14758}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014759
Georg Brandl66c221e2010-10-14 07:04:07 +000014760/* A _string module, to export formatter_parser and formatter_field_name_split
14761 to the string.Formatter class implemented in Python. */
14762
14763static PyMethodDef _string_methods[] = {
14764 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14765 METH_O, PyDoc_STR("split the argument as a field name")},
14766 {"formatter_parser", (PyCFunction) formatter_parser,
14767 METH_O, PyDoc_STR("parse the argument as a format string")},
14768 {NULL, NULL}
14769};
14770
14771static struct PyModuleDef _string_module = {
14772 PyModuleDef_HEAD_INIT,
14773 "_string",
14774 PyDoc_STR("string helper module"),
14775 0,
14776 _string_methods,
14777 NULL,
14778 NULL,
14779 NULL,
14780 NULL
14781};
14782
14783PyMODINIT_FUNC
14784PyInit__string(void)
14785{
14786 return PyModule_Create(&_string_module);
14787}
14788
14789
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014790#ifdef __cplusplus
14791}
14792#endif